arm_compute v20.08
diff --git a/Android.bp b/Android.bp
index 59fb270..d033d2d 100644
--- a/Android.bp
+++ b/Android.bp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2020 ARM Ltd. All rights reserved.
+// Copyright © 2020 Arm Ltd. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
@@ -45,7 +45,8 @@
                          "arm_compute/core/NEON/kernels/assembly",
                          "arm_compute/core/NEON/kernels/convolution/common",
                          "arm_compute/core/NEON/kernels/convolution/depthwise",
-                         "arm_compute/core/NEON/kernels/convolution/winograd"],
+                         "src/core/NEON/kernels/assembly",
+                         "src/core/NEON/kernels/convolution/winograd"],
     export_include_dirs: [".", "./include"],
     srcs: [
         "src/core/AccessWindowAutoPadding.cpp",
@@ -55,6 +56,7 @@
         "src/core/CL/CLCoreRuntimeContext.cpp",
         "src/core/CL/CLHelpers.cpp",
         "src/core/CL/CLKernelLibrary.cpp",
+        "src/core/CL/CLUtils.cpp",
         "src/core/CL/ICLDistribution1D.cpp",
         "src/core/CL/ICLHOG.cpp",
         "src/core/CL/ICLKernel.cpp",
@@ -133,7 +135,6 @@
         "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp",
         "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp",
         "src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp",
-        "src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp",
         "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp",
         "src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp",
         "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp",
@@ -158,6 +159,7 @@
         "src/core/CL/kernels/CLLKTrackerKernel.cpp",
         "src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp",
         "src/core/CL/kernels/CLMagnitudePhaseKernel.cpp",
+        "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp",
         "src/core/CL/kernels/CLMeanStdDevKernel.cpp",
         "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp",
         "src/core/CL/kernels/CLMedian3x3Kernel.cpp",
@@ -228,6 +230,7 @@
         "src/core/IDistribution1D.cpp",
         "src/core/IKernel.cpp",
         "src/core/ITensor.cpp",
+        "src/core/ITensorPack.cpp",
         "src/core/MultiImageInfo.cpp",
         "src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp",
         "src/core/NEON/kernels/NEAccumulateKernel.cpp",
@@ -274,7 +277,6 @@
         "src/core/NEON/kernels/NEFastCornersKernel.cpp",
         "src/core/NEON/kernels/NEFillArrayKernel.cpp",
         "src/core/NEON/kernels/NEFillBorderKernel.cpp",
-        "src/core/NEON/kernels/NEFillInnerBorderKernel.cpp",
         "src/core/NEON/kernels/NEFlattenLayerKernel.cpp",
         "src/core/NEON/kernels/NEFloorKernel.cpp",
         "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp",
@@ -287,10 +289,8 @@
         "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp",
         "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp",
         "src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp",
-        "src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp",
         "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp",
         "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp",
-        "src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp",
         "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp",
         "src/core/NEON/kernels/NEGatherKernel.cpp",
         "src/core/NEON/kernels/NEGaussian3x3Kernel.cpp",
@@ -309,6 +309,7 @@
         "src/core/NEON/kernels/NELKTrackerKernel.cpp",
         "src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp",
         "src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp",
+        "src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp",
         "src/core/NEON/kernels/NEMeanStdDevKernel.cpp",
         "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp",
         "src/core/NEON/kernels/NEMedian3x3Kernel.cpp",
@@ -366,7 +367,6 @@
         "src/core/NEON/kernels/arm_gemm/mergeresults.cpp",
         "src/core/NEON/kernels/arm_gemm/misc.cpp",
         "src/core/NEON/kernels/arm_gemm/quantized.cpp",
-        "src/core/NEON/kernels/assembly/Helpers.cpp",
         "src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp",
         "src/core/NEON/kernels/convolution/common/padding.cpp",
         "src/core/NEON/kernels/convolution/common/qasymm8.cpp",
@@ -413,6 +413,7 @@
         "src/core/Utils.cpp",
         "src/core/Validate.cpp",
         "src/core/Version.cpp",
+        "src/core/utils/ScaleUtils.cpp",
         "src/core/utils/helpers/fft.cpp",
         "src/core/utils/helpers/tensor_transform.cpp",
         "src/core/utils/io/FileHandler.cpp",
@@ -435,6 +436,7 @@
         "src/runtime/CL/CLMemoryRegion.cpp",
         "src/runtime/CL/CLMultiHOG.cpp",
         "src/runtime/CL/CLMultiImage.cpp",
+        "src/runtime/CL/CLOperator.cpp",
         "src/runtime/CL/CLPyramid.cpp",
         "src/runtime/CL/CLRuntimeContext.cpp",
         "src/runtime/CL/CLScheduler.cpp",
@@ -519,6 +521,7 @@
         "src/runtime/CL/functions/CLLaplacianReconstruct.cpp",
         "src/runtime/CL/functions/CLLocallyConnectedLayer.cpp",
         "src/runtime/CL/functions/CLMagnitude.cpp",
+        "src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp",
         "src/runtime/CL/functions/CLMeanStdDev.cpp",
         "src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp",
         "src/runtime/CL/functions/CLMedian3x3.cpp",
@@ -603,6 +606,7 @@
         "src/runtime/MemoryManagerOnDemand.cpp",
         "src/runtime/MultiHOG.cpp",
         "src/runtime/MultiImage.cpp",
+        "src/runtime/NEON/INEOperator.cpp",
         "src/runtime/NEON/INESimpleFunction.cpp",
         "src/runtime/NEON/INESimpleFunctionNoBorder.cpp",
         "src/runtime/NEON/functions/NEAbsoluteDifference.cpp",
@@ -685,6 +689,7 @@
         "src/runtime/NEON/functions/NELaplacianReconstruct.cpp",
         "src/runtime/NEON/functions/NELocallyConnectedLayer.cpp",
         "src/runtime/NEON/functions/NEMagnitude.cpp",
+        "src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp",
         "src/runtime/NEON/functions/NEMeanStdDev.cpp",
         "src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp",
         "src/runtime/NEON/functions/NEMedian3x3.cpp",
@@ -740,6 +745,7 @@
         "src/runtime/OMP/OMPScheduler.cpp",
         "src/runtime/OffsetLifetimeManager.cpp",
         "src/runtime/OffsetMemoryPool.cpp",
+        "src/runtime/OperatorTensor.cpp",
         "src/runtime/PoolManager.cpp",
         "src/runtime/Pyramid.cpp",
         "src/runtime/RuntimeContext.cpp",
@@ -768,31 +774,35 @@
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/x1.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_native_fp32_mla_16x4/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/a55.cpp",
@@ -809,21 +819,18 @@
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/sve_native_bf16fp32_dot_4VLx4/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp",
-                "src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8/generic.cpp",
diff --git a/LICENSE b/LICENSE
index 1b316d3..be84736 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2017-2020 ARM Software
+Copyright (c) 2017-2020 Arm Limited
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/SConscript b/SConscript
index d3d5780..f19122c 100644
--- a/SConscript
+++ b/SConscript
@@ -1,4 +1,4 @@
-# Copyright (c) 2016, 2017 ARM Limited.
+# Copyright (c) 2016, 2017 Arm Limited.
 #
 # SPDX-License-Identifier: MIT
 #
@@ -24,9 +24,9 @@
 import re
 import subprocess
 
-VERSION = "v20.05"
-LIBRARY_VERSION_MAJOR = 19
-LIBRARY_VERSION_MINOR =  1
+VERSION = "v20.08"
+LIBRARY_VERSION_MAJOR = 20
+LIBRARY_VERSION_MINOR =  0
 LIBRARY_VERSION_PATCH =  0
 SONAME_VERSION = str(LIBRARY_VERSION_MAJOR) + "." + str(LIBRARY_VERSION_MINOR) + "." + str(LIBRARY_VERSION_PATCH)
 
@@ -178,6 +178,7 @@
 core_files = Glob('src/core/*.cpp')
 core_files += Glob('src/core/CPP/*.cpp')
 core_files += Glob('src/core/CPP/kernels/*.cpp')
+core_files += Glob('src/core/utils/*.cpp')
 core_files += Glob('src/core/utils/helpers/*.cpp')
 core_files += Glob('src/core/utils/io/*.cpp')
 core_files += Glob('src/core/utils/quantization/*.cpp')
@@ -230,6 +231,8 @@
     arm_compute_env.Append(CPPPATH = ["arm_compute/core/NEON/kernels/convolution/common/",
                                       "arm_compute/core/NEON/kernels/convolution/winograd/",
                                       "arm_compute/core/NEON/kernels/convolution/depthwise/",
+                                      "src/core/NEON/kernels/assembly/",
+                                      "src/core/NEON/kernels/convolution/winograd/",
                                       "arm_compute/core/NEON/kernels/assembly/"])
 
     graph_files += Glob('src/graph/backends/NEON/*.cpp')
diff --git a/SConstruct b/SConstruct
index a0f6520..373e561 100644
--- a/SConstruct
+++ b/SConstruct
@@ -1,4 +1,4 @@
-# Copyright (c) 2016, 2017 ARM Limited.
+# Copyright (c) 2016, 2017 Arm Limited.
 #
 # SPDX-License-Identifier: MIT
 #
@@ -204,7 +204,7 @@
         env.Append(CXXFLAGS = ['-march=armv8-a'])
 
     if 'v8.6-a' in env['arch']:
-        env.Append(CPPDEFINES = ['V8P6', 'V8P6_BF', 'ARM_COMPUTE_FORCE_BF16'])
+        env.Append(CPPDEFINES = ['MMLA_INT8', 'MMLA_FP32', 'V8P6', 'V8P6_BF', 'ARM_COMPUTE_FORCE_BF16'])
 
 elif 'x86' in env['arch']:
     if env['estate'] == '32':
diff --git a/arm_compute/core/AccessWindowAutoPadding.h b/arm_compute/core/AccessWindowAutoPadding.h
index 8a182c6..12d6553 100644
--- a/arm_compute/core/AccessWindowAutoPadding.h
+++ b/arm_compute/core/AccessWindowAutoPadding.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/AccessWindowStatic.h b/arm_compute/core/AccessWindowStatic.h
index e40c188..1f2ca1b 100644
--- a/arm_compute/core/AccessWindowStatic.h
+++ b/arm_compute/core/AccessWindowStatic.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/AccessWindowTranspose.h b/arm_compute/core/AccessWindowTranspose.h
index 16105bc..8570909 100644
--- a/arm_compute/core/AccessWindowTranspose.h
+++ b/arm_compute/core/AccessWindowTranspose.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/CLCompileContext.h b/arm_compute/core/CL/CLCompileContext.h
index 2b6d8cd..f54fd01 100644
--- a/arm_compute/core/CL/CLCompileContext.h
+++ b/arm_compute/core/CL/CLCompileContext.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/CLCoreRuntimeContext.h b/arm_compute/core/CL/CLCoreRuntimeContext.h
index 2b2269d..23f2823 100644
--- a/arm_compute/core/CL/CLCoreRuntimeContext.h
+++ b/arm_compute/core/CL/CLCoreRuntimeContext.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/CLDevice.h b/arm_compute/core/CL/CLDevice.h
index 8128347..033bf8f 100644
--- a/arm_compute/core/CL/CLDevice.h
+++ b/arm_compute/core/CL/CLDevice.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/CLHelpers.h b/arm_compute/core/CL/CLHelpers.h
index c5db66c..cf18e16 100644
--- a/arm_compute/core/CL/CLHelpers.h
+++ b/arm_compute/core/CL/CLHelpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -97,14 +97,6 @@
  */
 std::string get_data_size_from_data_type(const DataType &dt);
 
-/** Translates fixed point tensor data type to the underlying OpenCL type.
- *
- * @param[in] dt @ref DataType to be translated to OpenCL type.
- *
- * @return The string specifying the underlying OpenCL type to be used.
- */
-std::string get_underlying_cl_type_from_data_type(const DataType &dt);
-
 /** Helper function to get the GPU target from CL device
  *
  * @param[in] device A CL device
@@ -121,6 +113,14 @@
  */
 CLVersion get_cl_version(const cl::Device &device);
 
+/** Helper function to get the cl_image pitch alignment in pixels
+ *
+ * @param[in] device A CL device
+ *
+ * @return the cl_image pitch alignment in pixels. If an error occurs, the function will return 0
+ */
+size_t get_cl_image_pitch_alignment(const cl::Device &device);
+
 /** Helper function to check whether a given extension is supported
  *
  * @param[in] device         A CL device
@@ -188,6 +188,14 @@
  */
 bool preferred_dummy_work_items_support(const cl::Device &device);
 
+/** Helper function to check whether the cl_khr_image2d_from_buffer extension is supported
+ *
+ * @param[in] device A CL device
+ *
+ * @return True if the extension is supported
+ */
+bool image2d_from_buffer_supported(const cl::Device &device);
+
 /** Creates an opencl kernel
  *
  * @param[in] ctx         A context to be used to create the opencl kernel.
diff --git a/arm_compute/core/CL/CLKernelLibrary.h b/arm_compute/core/CL/CLKernelLibrary.h
index 6c5df6c..caab78d 100644
--- a/arm_compute/core/CL/CLKernelLibrary.h
+++ b/arm_compute/core/CL/CLKernelLibrary.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h
index cd26399..dbda0db 100644
--- a/arm_compute/core/CL/CLKernels.h
+++ b/arm_compute/core/CL/CLKernels.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -84,7 +84,6 @@
 #include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h"
@@ -109,6 +108,7 @@
 #include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h"
 #include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h"
 #include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
+#include "arm_compute/core/CL/kernels/CLMaxUnpoolingLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h"
 #include "arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"
 #include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h"
diff --git a/arm_compute/core/CL/CLTypes.h b/arm_compute/core/CL/CLTypes.h
index 3643b17..c44e2c4 100644
--- a/arm_compute/core/CL/CLTypes.h
+++ b/arm_compute/core/CL/CLTypes.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/CLValidate.h b/arm_compute/core/CL/CLValidate.h
index 8f1733d..3f8b76b 100644
--- a/arm_compute/core/CL/CLValidate.h
+++ b/arm_compute/core/CL/CLValidate.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/ICLArray.h b/arm_compute/core/CL/ICLArray.h
index e11fb95..2fa2f34 100644
--- a/arm_compute/core/CL/ICLArray.h
+++ b/arm_compute/core/CL/ICLArray.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/ICLDistribution1D.h b/arm_compute/core/CL/ICLDistribution1D.h
index a9bafe3..18afabd 100644
--- a/arm_compute/core/CL/ICLDistribution1D.h
+++ b/arm_compute/core/CL/ICLDistribution1D.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/ICLGEMMKernelConfiguration.h b/arm_compute/core/CL/ICLGEMMKernelConfiguration.h
index e5f4a78..90600ef 100644
--- a/arm_compute/core/CL/ICLGEMMKernelConfiguration.h
+++ b/arm_compute/core/CL/ICLGEMMKernelConfiguration.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/ICLHOG.h b/arm_compute/core/CL/ICLHOG.h
index b42566e..da3c0c6 100644
--- a/arm_compute/core/CL/ICLHOG.h
+++ b/arm_compute/core/CL/ICLHOG.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/ICLKernel.h b/arm_compute/core/CL/ICLKernel.h
index 3e545c6..d4990a1 100644
--- a/arm_compute/core/CL/ICLKernel.h
+++ b/arm_compute/core/CL/ICLKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,7 @@
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/GPUTarget.h"
 #include "arm_compute/core/IKernel.h"
+#include "arm_compute/core/experimental/Types.h"
 
 #include <string>
 
@@ -216,7 +217,22 @@
      * @param[in]     window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
      * @param[in,out] queue  Command queue on which to enqueue the kernel.
      */
-    virtual void run(const Window &window, cl::CommandQueue &queue) = 0;
+    virtual void run(const Window &window, cl::CommandQueue &queue)
+    {
+        ARM_COMPUTE_UNUSED(window, queue);
+    }
+    /** Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue.
+     *
+     * @note The queue is *not* flushed by this method, and therefore the kernel will not have been executed by the time this method returns.
+     *
+     * @param[in]     tensors A vector containing the tensors to operate on.
+     * @param[in]     window  Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     * @param[in,out] queue   Command queue on which to enqueue the kernel.
+     */
+    virtual void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+    {
+        ARM_COMPUTE_UNUSED(tensors, window, queue);
+    }
     /** Add the passed parameters to the object's kernel's arguments starting from the index idx.
      *
      * @param[in,out] idx   Index at which to start adding the arguments. Will be incremented by the number of kernel arguments set.
diff --git a/arm_compute/core/CL/ICLLut.h b/arm_compute/core/CL/ICLLut.h
index 430adb8..b4d7471 100644
--- a/arm_compute/core/CL/ICLLut.h
+++ b/arm_compute/core/CL/ICLLut.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/ICLMultiHOG.h b/arm_compute/core/CL/ICLMultiHOG.h
index f921301..109b4d4 100644
--- a/arm_compute/core/CL/ICLMultiHOG.h
+++ b/arm_compute/core/CL/ICLMultiHOG.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/ICLMultiImage.h b/arm_compute/core/CL/ICLMultiImage.h
index 0233600..23ed04a 100644
--- a/arm_compute/core/CL/ICLMultiImage.h
+++ b/arm_compute/core/CL/ICLMultiImage.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/ICLSimple2DKernel.h b/arm_compute/core/CL/ICLSimple2DKernel.h
index bd42330..86561cd 100644
--- a/arm_compute/core/CL/ICLSimple2DKernel.h
+++ b/arm_compute/core/CL/ICLSimple2DKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/ICLSimple3DKernel.h b/arm_compute/core/CL/ICLSimple3DKernel.h
index e25051f..3b4eaf7 100644
--- a/arm_compute/core/CL/ICLSimple3DKernel.h
+++ b/arm_compute/core/CL/ICLSimple3DKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/ICLSimpleKernel.h b/arm_compute/core/CL/ICLSimpleKernel.h
index e8b6f0a..805342f 100644
--- a/arm_compute/core/CL/ICLSimpleKernel.h
+++ b/arm_compute/core/CL/ICLSimpleKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/ICLTensor.h b/arm_compute/core/CL/ICLTensor.h
index 001f892..fd05e64 100644
--- a/arm_compute/core/CL/ICLTensor.h
+++ b/arm_compute/core/CL/ICLTensor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/OpenCL.h b/arm_compute/core/CL/OpenCL.h
index b87cc67..f9796d7 100644
--- a/arm_compute/core/CL/OpenCL.h
+++ b/arm_compute/core/CL/OpenCL.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -134,6 +134,7 @@
     DECLARE_FUNCTION_PTR(clEnqueueSVMUnmap);
     DECLARE_FUNCTION_PTR(clEnqueueMarker);
     DECLARE_FUNCTION_PTR(clWaitForEvents);
+    DECLARE_FUNCTION_PTR(clCreateImage);
 
     // Third-party extensions
     DECLARE_FUNCTION_PTR(clImportMemoryARM);
diff --git a/arm_compute/core/CL/gemm/CLGEMMHelpers.h b/arm_compute/core/CL/gemm/CLGEMMHelpers.h
index dcda732..013c068 100644
--- a/arm_compute/core/CL/gemm/CLGEMMHelpers.h
+++ b/arm_compute/core/CL/gemm/CLGEMMHelpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,30 +24,50 @@
 #ifndef ARM_COMPUTE_CLGEMMHELPERS_H
 #define ARM_COMPUTE_CLGEMMHELPERS_H
 
+#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 
 namespace arm_compute
 {
+class ITensorInfo;
+struct GEMMRHSMatrixInfo;
+
 namespace cl_gemm
 {
 /** Configure @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
  *
- * @param[in] m              Number of rows (M) in the LHS matrix not reshaped
- * @param[in] n              Number of columns (N) in the RHS matrix not reshaped
- * @param[in] m0             Number of rows processed by each thread/work-item
- * @param[in] n0             Number of columns processed by each thread/work-item
- * @param[in] k0             Number of inner accumulation performed by each thread/work-item
- * @param[in] v0             Number of vertical blocks of size (m0xk0) stored on the same output row
- * @param[in] h0             Number of horizontal blocks of size (k0xn0) stored on the same output row
- * @param[in] lhs_interleave True if the v0 (m0xk0) blocks have to be interleaved in the output row
- * @param[in] rhs_interleave True if the h0 (k0xn0) blocks have to be interleaved in the output row
- * @param[in] lhs_transpose  True if the (m0xk0) block has to be transposed before been stored
- * @param[in] rhs_transpose  True if the (k0xn0) block has to be transposed before been stored
+ * @param[in] m                  Number of rows (M) in the LHS matrix not reshaped
+ * @param[in] n                  Number of columns (N) in the RHS matrix not reshaped
+ * @param[in] m0                 Number of rows processed by each thread/work-item
+ * @param[in] n0                 Number of columns processed by each thread/work-item
+ * @param[in] k0                 Number of inner accumulation performed by each thread/work-item
+ * @param[in] v0                 Number of vertical blocks of size (m0xk0) stored on the same output row
+ * @param[in] h0                 Number of horizontal blocks of size (k0xn0) stored on the same output row
+ * @param[in] lhs_interleave     True if the v0 (m0xk0) blocks have to be interleaved in the output row
+ * @param[in] rhs_interleave     True if the h0 (k0xn0) blocks have to be interleaved in the output row
+ * @param[in] lhs_transpose      True if the (m0xk0) block has to be transposed before being stored
+ * @param[in] rhs_transpose      True if the (k0xn0) block has to be transposed before being stored
+ * @param[in] export_to_cl_image (Optional) True if the RHS reshaped matrix has to be exported to cl_image
  *
  * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
  */
 std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
-                                                                       bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose);
+                                                                       bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image = false);
+
+/** Update padding required to export the OpenCL buffer to OpenCL image2d
+ *
+ * @param[in,out] tensor ITensorInfo of the tensor required to be exported to OpenCL image2d
+ */
+void update_padding_for_cl_image(ITensorInfo *tensor);
+
+/** Utility function to validate the image2d OpenCL object support on the RHS reshaped matrix
+ *
+ * @param[in] tensor_reshaped_info TensorInfo for the RHS reshaped matrix
+ * @param[in] rhs_info             @ref GEMMRHSMatrixInfo
+ *
+ * @return Status reporting if we can use the image2d OpenCL object on the RHS reshaped matrix
+ */
+Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info);
 } // namespace cl_gemm
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_CLGEMMHELPERS_H */
diff --git a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h
index a6341e5..7270a8e 100644
--- a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h
+++ b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h
index 5b2abe6..1e49896 100644
--- a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h
+++ b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h
index 0e95a15..4cebfce 100644
--- a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h
+++ b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h
index e739997..07389ea 100644
--- a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h
+++ b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h
index 10dc9ae..b953fd2 100644
--- a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h
+++ b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h
index 55742e3..4df2784 100644
--- a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h
+++ b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h
index e659741..7a617e0 100644
--- a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h
+++ b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h
index 7909726..6d5ce88 100644
--- a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h
+++ b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h
index 044bdc7..346bfd7 100644
--- a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h
+++ b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h
index 6dba6fd..2162baf 100644
--- a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h
+++ b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h b/arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h
index 58dea3b..f62855c 100644
--- a/arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h
+++ b/arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLAccumulateKernel.h b/arm_compute/core/CL/kernels/CLAccumulateKernel.h
index f639148..e067da0 100644
--- a/arm_compute/core/CL/kernels/CLAccumulateKernel.h
+++ b/arm_compute/core/CL/kernels/CLAccumulateKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLActivationLayerKernel.h b/arm_compute/core/CL/kernels/CLActivationLayerKernel.h
index 1e83a68..81d4ccb 100644
--- a/arm_compute/core/CL/kernels/CLActivationLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLActivationLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,23 +49,13 @@
      *
      * @note If the output tensor is a nullptr, the activation function will be performed in-place
      *
-     * @param[in, out] input    Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
-     *                          of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
-     * @param[out]     output   Destination tensor. Data type supported: same as @p input
-     * @param[in]      act_info Activation layer information.
-     */
-    void configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info);
-    /** Set the input and output tensor.
-     *
-     * @note If the output tensor is a nullptr, the activation function will be performed in-place
-     *
      * @param[in]      compile_context The compile context to be used.
      * @param[in, out] input           Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
      *                                 of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
      * @param[out]     output          Destination tensor. Data type supported: same as @p input
      * @param[in]      act_info        Activation layer information.
      */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info);
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *output, ActivationLayerInfo act_info);
     /** Static function to check if given info will lead to a valid configuration of @ref CLActivationLayerKernel
      *
      * @param[in] input    Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result
@@ -78,12 +68,10 @@
     static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info);
 
     // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 
 private:
-    ICLTensor *_input;
-    ICLTensor *_output;
-    bool       _run_in_place;
+    bool _run_in_place;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_CLACTIVATIONLAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h b/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h
index 94e8bae..48876c0 100644
--- a/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -56,7 +56,7 @@
 
     /** Set the input and output tensors.
      *
-     * @param[in]  input       Source tensor. Data types supported: S32/F16/F32.
+     * @param[in]  input       Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
      * @param[in]  prev_output Destination tensor of the previous iterations of @ref CLArgMinMaxLayerKernel. Data types supported: U32/S32
      *                         Has to be nullptr for the first iteration
      * @param[out] output      Destination tensor. Data types supported: U32/S32
@@ -68,7 +68,7 @@
     /** Set the input and output tensors.
      *
      * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: S32/F16/F32.
+     * @param[in]  input           Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
      * @param[in]  prev_output     Destination tensor of the previous iterations of @ref CLArgMinMaxLayerKernel. Data types supported: U32/S32
      *                             Has to be nullptr for the first iteration
      * @param[out] output          Destination tensor. Data types supported: U32/S32
@@ -80,7 +80,7 @@
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLArgMinMaxLayerKernel.
      *
-     * @param[in] input       Source tensor info. Data types supported: S32/F16/F32.
+     * @param[in] input       Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
      * @param[in] prev_output Destination tensor info of the previous iterations. Data types supported: U32/S32
      *                        Has to be nullptr for the first iteration
      * @param[in] output      Destination tensor info. Data types supported: U32/S32
diff --git a/arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h b/arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h
index 1636668..bb8968c 100644
--- a/arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -52,17 +52,6 @@
     ~CLBatchConcatenateLayerKernel() = default;
     /** Initialise the kernel's inputs and output
      *
-     * @param[in]     input        Input tensor. Data types supported: All.
-     * @param[in]     batch_offset The offset on axis # 3.
-     * @param[in,out] output       Output tensor. Data types supported: Same as @p input.
-     *
-     * @note: The output tensor's low two dimensions can't be smaller than the input one's.
-     * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2.
-     *
-     */
-    void configure(const ICLTensor *input, unsigned int batch_offset, ICLTensor *output);
-    /** Initialise the kernel's inputs and output
-     *
      * @param[in]     compile_context The compile context to be used.
      * @param[in]     input           Input tensor. Data types supported: All.
      * @param[in]     batch_offset    The offset on axis # 3.
@@ -72,7 +61,7 @@
      * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2.
      *
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int batch_offset, ICLTensor *output);
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int batch_offset, ITensorInfo *output);
     /**  Static function to check if given info will lead to a valid configuration of @ref CLBatchConcatenateLayerKernel
      *
      * @param[in] input        Input tensor info. Data types supported: All.
@@ -84,12 +73,10 @@
     static Status validate(const ITensorInfo *input, unsigned int batch_offset, const ITensorInfo *output);
 
     // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 
 private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    unsigned int     _batch_offset;
+    unsigned int _batch_offset;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CLBATCHCONCATENATEKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h b/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h
index 8eaaca8..c556a0c 100644
--- a/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h b/arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h
index 2b12ad0..7af88d8 100644
--- a/arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLBitwiseAndKernel.h b/arm_compute/core/CL/kernels/CLBitwiseAndKernel.h
index 8defe32..e291f08 100644
--- a/arm_compute/core/CL/kernels/CLBitwiseAndKernel.h
+++ b/arm_compute/core/CL/kernels/CLBitwiseAndKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLBitwiseNotKernel.h b/arm_compute/core/CL/kernels/CLBitwiseNotKernel.h
index b86ce7f..f57bbf4 100644
--- a/arm_compute/core/CL/kernels/CLBitwiseNotKernel.h
+++ b/arm_compute/core/CL/kernels/CLBitwiseNotKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLBitwiseOrKernel.h b/arm_compute/core/CL/kernels/CLBitwiseOrKernel.h
index 65eb50f..944224e 100644
--- a/arm_compute/core/CL/kernels/CLBitwiseOrKernel.h
+++ b/arm_compute/core/CL/kernels/CLBitwiseOrKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLBitwiseXorKernel.h b/arm_compute/core/CL/kernels/CLBitwiseXorKernel.h
index 5c63a7f..732ae86 100644
--- a/arm_compute/core/CL/kernels/CLBitwiseXorKernel.h
+++ b/arm_compute/core/CL/kernels/CLBitwiseXorKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h b/arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h
index bbe1156..4e8c5a6 100644
--- a/arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h
+++ b/arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLBox3x3Kernel.h b/arm_compute/core/CL/kernels/CLBox3x3Kernel.h
index ea3c1c1..1a8572d 100644
--- a/arm_compute/core/CL/kernels/CLBox3x3Kernel.h
+++ b/arm_compute/core/CL/kernels/CLBox3x3Kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLCannyEdgeKernel.h b/arm_compute/core/CL/kernels/CLCannyEdgeKernel.h
index 40ad4dc..c4d0297 100644
--- a/arm_compute/core/CL/kernels/CLCannyEdgeKernel.h
+++ b/arm_compute/core/CL/kernels/CLCannyEdgeKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,8 +26,6 @@
 
 #include "arm_compute/core/CL/ICLKernel.h"
 
-#include <cstdint>
-
 namespace arm_compute
 {
 class ICLTensor;
@@ -136,7 +134,7 @@
     CLEdgeTraceKernel &operator=(const CLEdgeTraceKernel &) = delete;
     /** Initialise the kernel's source, destination and border mode.
      *
-     * @param[in]     input            Source tensor. Data types supported: U8.
+     * @param[in]     input            Source tensor. Data types supported: U16/U32.
      * @param[out]    output           Destination tensor. Data types supported: U8.
      * @param[in]     upper_thr        Upper threshold used for the hysteresis
      * @param[in]     lower_thr        Lower threshold used for the hysteresis
@@ -154,7 +152,7 @@
     /** Initialise the kernel's source, destination and border mode.
      *
      * @param[in]     compile_context  The compile context to be used.
-     * @param[in]     input            Source tensor. Data types supported: U8.
+     * @param[in]     input            Source tensor. Data types supported: U16/U32.
      * @param[out]    output           Destination tensor. Data types supported: U8.
      * @param[in]     upper_thr        Upper threshold used for the hysteresis
      * @param[in]     lower_thr        Lower threshold used for the hysteresis
diff --git a/arm_compute/core/CL/kernels/CLChannelCombineKernel.h b/arm_compute/core/CL/kernels/CLChannelCombineKernel.h
index 32ddf15..f9c33df 100644
--- a/arm_compute/core/CL/kernels/CLChannelCombineKernel.h
+++ b/arm_compute/core/CL/kernels/CLChannelCombineKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,7 +57,7 @@
      * @param[in]  plane1 The 2D plane that forms channel 1. Must be of U8 format.
      * @param[in]  plane2 The 2D plane that forms channel 2. Must be of U8 format.
      * @param[in]  plane3 The 2D plane that forms channel 3. Must be of U8 format.
-     * @param[out] output The single planar output tensor.
+     * @param[out] output The single planar output tensor. Supported formats: RGB888/RGBA8888/YUYV422/UYVY422.
      */
     void configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output);
     /** Configure function's inputs and outputs.
@@ -75,7 +75,7 @@
      * @param[in]  plane0 The 2D plane that forms channel 0. Must be of U8 format.
      * @param[in]  plane1 The 2D plane that forms channel 1. Must be of U8 format.
      * @param[in]  plane2 The 2D plane that forms channel 2. Must be of U8 format.
-     * @param[out] output The multi planar output tensor.
+     * @param[out] output The multi planar output tensor. Supported formats: RGB888/RGBA8888/YUYV422/UYVY422.
      */
     void configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output);
     /** Configure function's inputs and outputs.
@@ -84,7 +84,7 @@
      * @param[in]  plane0          The 2D plane that forms channel 0. Must be of U8 format.
      * @param[in]  plane1          The 2D plane that forms channel 1. Must be of U8 format.
      * @param[in]  plane2          The 2D plane that forms channel 2. Must be of U8 format.
-     * @param[out] output          The multi planar output tensor.
+     * @param[out] output          The multi planar output tensor. Supported formats: RGB888/RGBA8888/YUYV422/UYVY422.
      */
     void configure(const CLCompileContext &compile_context, const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output);
 
diff --git a/arm_compute/core/CL/kernels/CLChannelExtractKernel.h b/arm_compute/core/CL/kernels/CLChannelExtractKernel.h
index 6a0c4bb..1ccf38b 100644
--- a/arm_compute/core/CL/kernels/CLChannelExtractKernel.h
+++ b/arm_compute/core/CL/kernels/CLChannelExtractKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h b/arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h
index 14b59d3..bf58525 100644
--- a/arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLCol2ImKernel.h b/arm_compute/core/CL/kernels/CLCol2ImKernel.h
index d0528ed..c3a1ff3 100644
--- a/arm_compute/core/CL/kernels/CLCol2ImKernel.h
+++ b/arm_compute/core/CL/kernels/CLCol2ImKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLColorConvertKernel.h b/arm_compute/core/CL/kernels/CLColorConvertKernel.h
index 2bcd141..d57bb3d 100644
--- a/arm_compute/core/CL/kernels/CLColorConvertKernel.h
+++ b/arm_compute/core/CL/kernels/CLColorConvertKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLComparisonKernel.h b/arm_compute/core/CL/kernels/CLComparisonKernel.h
index d5c5297..bbf5f19 100644
--- a/arm_compute/core/CL/kernels/CLComparisonKernel.h
+++ b/arm_compute/core/CL/kernels/CLComparisonKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h b/arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h
index d3e57a6..5d9e9bd 100644
--- a/arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h
+++ b/arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLConvolutionKernel.h b/arm_compute/core/CL/kernels/CLConvolutionKernel.h
index b6fe51d..0f500fb 100644
--- a/arm_compute/core/CL/kernels/CLConvolutionKernel.h
+++ b/arm_compute/core/CL/kernels/CLConvolutionKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -108,7 +108,7 @@
      *
      * @param[in]  compile_context  The compile context to be used.
      * @param[in]  input            Source tensor. Data types supported: U8.
-     * @param[out] output           Destination tensor, Data types supported: S16.
+     * @param[out] output           Destination tensor, Data types supported: U16/S16/S32.
      * @param[in]  conv             Convolution matrix to apply to the input tensor.
      * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
      */
@@ -135,7 +135,7 @@
 public:
     /** Initialise the kernel's input, output and border mode.
      *
-     * @param[in]  input            Source tensor. Data types supported: S16.
+     * @param[in]  input            Source tensor. Data types supported: U16/S16/S32.
      * @param[out] output           Destination tensor, Data types supported: U8, S16.
      * @param[in]  conv             Convolution matrix to apply to the input tensor.
      * @param[in]  scale            Scale of the convolution matrix.
@@ -146,7 +146,7 @@
     /** Initialise the kernel's input, output and border mode.
      *
      * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data types supported: S16.
+     * @param[in]  input            Source tensor. Data types supported: U16/S16/S32.
      * @param[out] output           Destination tensor, Data types supported: U8, S16.
      * @param[in]  conv             Convolution matrix to apply to the input tensor.
      * @param[in]  scale            Scale of the convolution matrix.
diff --git a/arm_compute/core/CL/kernels/CLCopyKernel.h b/arm_compute/core/CL/kernels/CLCopyKernel.h
index 05dff8e..11a6d54 100644
--- a/arm_compute/core/CL/kernels/CLCopyKernel.h
+++ b/arm_compute/core/CL/kernels/CLCopyKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,7 +47,7 @@
     CLCopyKernel &operator=(CLCopyKernel &&) = default;
     /** Initialize the kernel's input, output.
      *
-     * @param[in]  input         Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+     * @param[in]  input         Source tensor. Data types supported: All.
      * @param[out] output        Destination tensor. Data types supported: same as @p input.
      * @param[in]  padding       (Optional) Padding to be applied to the input tensor
      * @param[in]  output_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr.
@@ -56,7 +56,7 @@
     /** Initialize the kernel's input, output.
      *
      * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+     * @param[in]  input           Source tensor. Data types supported: All.
      * @param[out] output          Destination tensor. Data types supported: same as @p input.
      * @param[in]  padding         (Optional) Padding to be applied to the input tensor
      * @param[in]  output_window   (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr.
@@ -64,7 +64,7 @@
     void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding = PaddingList(), Window *output_window = nullptr);
     /** Static function to check if given info will lead to a valid configuration of @ref CLCopyKernel
      *
-     * @param[in] input         Source tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+     * @param[in] input         Source tensor info. Data types supported: All.
      * @param[in] output        Destination tensor info. Data types supported: same as @p input.
      * @param[in] padding       (Optional) Padding to be applied to the input tensor
      * @param[in] output_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr.
diff --git a/arm_compute/core/CL/kernels/CLCropKernel.h b/arm_compute/core/CL/kernels/CLCropKernel.h
index a1c6f90..91d70e6 100644
--- a/arm_compute/core/CL/kernels/CLCropKernel.h
+++ b/arm_compute/core/CL/kernels/CLCropKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,7 +49,7 @@
      *
      * @note Supported tensor rank: up to 4
      *
-     * @param[in]  input               Source tensor. Data type supported: U16/S16/U32/S32/F16/F32. Data layouts supported: NHWC.
+     * @param[in]  input               Source tensor. Data type supported: All. Data layouts supported: NHWC.
      * @param[out] output              Destination tensor. Data type supported: F32
      * @param[in]  start               Coordinates of where to start cropping the image.
      * @param[in]  end                 Coordinates of where to end cropping the image.
@@ -63,7 +63,7 @@
      * @note Supported tensor rank: up to 4
      *
      * @param[in]  compile_context     The compile context to be used.
-     * @param[in]  input               Source tensor. Data type supported: U16/S16/U32/S32/F16/F32. Data layouts supported: NHWC.
+     * @param[in]  input               Source tensor. Data type supported: All. Data layouts supported: NHWC.
      * @param[out] output              Destination tensor. Data type supported: F32
      * @param[in]  start               Coordinates of where to start cropping the image.
      * @param[in]  end                 Coordinates of where to end cropping the image.
@@ -78,7 +78,7 @@
      *
      * @note Supported tensor rank: up to 4
      *
-     * @param[in] input               Source tensor info. Data type supported: U16/S16/U32/S32/F16/F32. Data layouts supported: NHWC.
+     * @param[in] input               Source tensor info. Data type supported: All. Data layouts supported: NHWC.
      * @param[in] output              Destination tensor info. Data type supported: F32
      * @param[in] start               Coordinates of where to start cropping the image.
      * @param[in] end                 Coordinates of where to end cropping the image.
diff --git a/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h b/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
index 0c65f51..84265a2 100644
--- a/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
+++ b/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h b/arm_compute/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h
index 292c561..688c943 100644
--- a/arm_compute/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h
+++ b/arm_compute/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h b/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h
index 5fe826d..d8493bc 100644
--- a/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,8 +30,6 @@
 
 namespace arm_compute
 {
-class ICLTensor;
-
 /** Interface for the depth concatenate kernel.
  *  The input tensor will be concatenated into the output tensor.
  */
@@ -52,17 +50,6 @@
     ~CLDepthConcatenateLayerKernel() = default;
     /** Initialise the kernel's inputs and output
      *
-     * @param[in]     input        Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in]     depth_offset The offset on the Z axis.
-     * @param[in,out] output       Output tensor. Data types supported: Same as @p input.
-     *
-     * @note: The output tensor's low two dimensions can't be smaller than the input one's.
-     * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2.
-     *
-     */
-    void configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output);
-    /** Initialise the kernel's inputs and output
-     *
      * @param[in]     compile_context The compile context to be used.
      * @param[in]     input           Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[in]     depth_offset    The offset on the Z axis.
@@ -72,7 +59,7 @@
      * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2.
      *
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int depth_offset, ICLTensor *output);
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int depth_offset, ITensorInfo *output);
     /**  Static function to check if given info will lead to a valid configuration of @ref CLDepthConcatenateLayerKernel
      *
      * @param[in] input        Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
@@ -84,12 +71,10 @@
     static Status validate(const ITensorInfo *input, unsigned int depth_offset, const ITensorInfo *output);
 
     // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 
 private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    unsigned int     _depth_offset;
+    unsigned int _depth_offset;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CLDEPTHCONCATENATEKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h b/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h
index 66eb622..7f9696d 100644
--- a/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h b/arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h
index 87ac3c1..1bd1e8e 100644
--- a/arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h
index 6cf0326..93e7e37 100644
--- a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h
+++ b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h
index e564cf6..4ca6c0b 100644
--- a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h
+++ b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -40,7 +40,7 @@
     /** Default move assignment operator. */
     /** Initialize the function's source, destination, conv and border_size.
      *
-     * @param[in]  input              Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED.
+     * @param[in]  input              Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[in]  weights            Weights tensor. A 3D tensor with dimensions [IFM, 3, 3].
      *                                Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
      * @param[in]  biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
@@ -61,7 +61,7 @@
     /** Initialize the function's source, destination, conv and border_size.
      *
      * @param[in]  compile_context    The compile context to be used.
-     * @param[in]  input              Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED.
+     * @param[in]  input              Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[in]  weights            Weights tensor. A 3D tensor with dimensions [IFM, 3, 3].
      *                                Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
      * @param[in]  biases             Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
@@ -81,7 +81,7 @@
                    const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) override;
     /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer3x3NHWCKernel
      *
-     * @param[in] input              Source tensor info. DataType supported: QASYMM8/QASYMM8_SIGNED.
+     * @param[in] input              Source tensor info. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[in] weights            Weights tensor info. A 3D tensor with dimensions [IFM, 3, 3].
      *                               Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
      * @param[in] biases             Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
diff --git a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
index 8847cf9..03a0106 100644
--- a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
+++ b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h
index 8dc5d32..51aaf17 100644
--- a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h
+++ b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h b/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h
index bb154f1..7a582da 100644
--- a/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLDerivativeKernel.h b/arm_compute/core/CL/kernels/CLDerivativeKernel.h
index cd8ae90..b49905a 100644
--- a/arm_compute/core/CL/kernels/CLDerivativeKernel.h
+++ b/arm_compute/core/CL/kernels/CLDerivativeKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLDilateKernel.h b/arm_compute/core/CL/kernels/CLDilateKernel.h
index 45f5fe0..747f8fa 100644
--- a/arm_compute/core/CL/kernels/CLDilateKernel.h
+++ b/arm_compute/core/CL/kernels/CLDilateKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h b/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h
index 489d7c2..5281a0c 100644
--- a/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -54,7 +54,7 @@
      *        1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3
      *        3x3 convolution with stride_x = 1/2, stride_y = 1/2
      *        5x5 convolution with stride_x = 1/2, stride_y = 1/2
-     *        9x9 convolution with stride_x = 1/2, stride_y = 1/2, data_layout=NHWC
+     *        9x9 convolution with stride_x = 1/2, stride_y = 1/2
      *
      * @param[in]  input     The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
      *                       while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
@@ -74,7 +74,7 @@
      *        1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3
      *        3x3 convolution with stride_x = 1/2, stride_y = 1/2
      *        5x5 convolution with stride_x = 1/2, stride_y = 1/2
-     *        9x9 convolution with stride_x = 1/2, stride_y = 1/2, data_layout=NHWC
+     *        9x9 convolution with stride_x = 1/2, stride_y = 1/2
      *
      * @param[in]  compile_context The compile context to be used.
      * @param[in]  input           The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
diff --git a/arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h b/arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h
index e190bde..82cd953 100644
--- a/arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,27 +30,25 @@
 
 namespace arm_compute
 {
-class ICLTensor;
-
 /** Interface for the elementwise unary operator */
-class CLElementWiseUnaryLayerKernel : public ICLSimpleKernel
+class CLElementWiseUnaryLayerKernel : public ICLKernel
 {
 public:
     /** Initialise the kernel's inputs, output.
      *
-     * @param[in]  input  First tensor input. Data types supported: F16/F32.
-     * @param[out] output Output tensor. Data types supported: Same as @p input.
+     * @param[in]  input  First tensor input info. Data types supported: F16/F32.
+     * @param[out] output Output tensor info. Data types supported: Same as @p input.
      * @param[in]  op     Element wise unary operation to perform.
      */
-    void configure(const ICLTensor *input, ICLTensor *output, const ElementWiseUnary &op);
+    void configure(const ITensorInfo *input, ITensorInfo *output, const ElementWiseUnary &op);
     /** Initialise the kernel's inputs, output.
      *
      * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           First tensor input. Data types supported: F16/F32.
-     * @param[out] output          Output tensor. Data types supported: Same as @p input.
+     * @param[in]  input           First tensor input info. Data types supported: F16/F32.
+     * @param[out] output          Output tensor info. Data types supported: Same as @p input.
      * @param[in]  op              Element wise unary operation to perform.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ElementWiseUnary &op);
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const ElementWiseUnary &op);
     /** Static function to check if given info will lead to a valid configuration of @ref CLElementWiseUnaryLayerKernel
      *
      * @param[in] input  First tensor input info. Data types supported: F16/F32.
@@ -62,7 +60,7 @@
     static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ElementWiseUnary &op);
 
     // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CLELEMENTWISEUNARYLAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h b/arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h
index 4d3d4bc..b459292 100644
--- a/arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h
+++ b/arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -54,8 +54,7 @@
     ~CLElementwiseOperationKernel() = default;
 
     // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
     BorderSize border_size() const override;
 
 protected:
@@ -64,22 +63,14 @@
 
     /** Initialise the kernel's output.
      *
-     * @param[in] input1 First tensor input. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32.
-     * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
-     * @param[in] output Output tensor. Data types supported: Same as @p input1.
+     * @param[in] input1 First tensor input info. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32.
+     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output Output tensor info. Data types supported: Same as @p input1.
      *
      * @return a pair of Status and Window
      */
     virtual std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) = 0;
 
-    /** Validate the argument passed to the kernel
-     *
-     * @param[in] input1 First tensor input. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32.
-     * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
-     * @param[in] output Output tensor. Data types supported: Same as @p input1.
-     */
-    virtual Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) = 0;
-
     /** Generate the build options for the specific kernel
      *
      * @reutrn a CLBuildOptions struct
@@ -95,18 +86,18 @@
     /** Commmon configure function for element-wise operators with no additional options (e.g., Div, Min, Max, SquaredDiff)
      *
      */
-    void configure_common(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+    void configure_common(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output);
     /** Commmon configure function for element-wise operators with no additional options (e.g., Div, Min, Max, SquaredDiff)
      *
      */
-    void configure_common(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+    void configure_common(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output);
 
     ActivationLayerInfo _act_info;
 
 private:
-    const ICLTensor *_input1; /**< Source tensor 1 */
-    const ICLTensor *_input2; /**< Source tensor 2 */
-    ICLTensor       *_output; /**< Destination tensor */
+    const ITensorInfo *_input1; /**< Source tensor info 1 */
+    const ITensorInfo *_input2; /**< Source tensor info 2 */
+    ITensorInfo       *_output; /**< Destination tensor info */
 };
 
 /** Addition operation */
@@ -121,32 +112,32 @@
     /** Static function to check if given info will lead to a valid configuration of @ref CLSaturatedArithmeticOperationKernel
      *
      * @param[in] op       Arithmetic operation to be executed.
-     * @param[in] input1   First tensor input. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/QSYMM16/F16/U32/S32/F32.
-     * @param[in] input2   Second tensor input. Data types supported: Same as @p input1.
-     * @param[in] output   Output tensor. Data types supported: Same as @p input1.
+     * @param[in] input1   First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
+     * @param[in] input2   Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output   Output tensor info. Data types supported: Same as @p input1.
      * @param[in] policy   Policy to use to handle overflow.
      * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ConvertPolicy &policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ConvertPolicy &policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref CLSaturatedArithmeticOperationKernel
      *
      * @param[in] compile_context The compile context to be used.
      * @param[in] op              Arithmetic operation to be executed.
-     * @param[in] input1          First tensor input. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/QSYMM16/F16/U32/S32/F32.
-     * @param[in] input2          Second tensor input. Data types supported: Same as @p input1.
-     * @param[in] output          Output tensor. Data types supported: Same as @p input1.
+     * @param[in] input1          First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
+     * @param[in] input2          Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output          Output tensor info. Data types supported: Same as @p input1.
      * @param[in] policy          Policy to use to handle overflow.
      * @param[in] act_info        (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(const CLCompileContext &compile_context, ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ConvertPolicy &policy,
+    void configure(const CLCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ConvertPolicy &policy,
                    const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLSaturatedArithmeticOperationKernel
      *
      * @param[in] op       Arithmetic operation to be executed.
-     * @param[in] input1   First tensor input info. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/QSYMM16/F16/U32/S32/F32.
-     * @param[in] input2   Second tensor input info. Data types supported: Same as @p input1.
-     * @param[in] output   Output tensor info. Data types supported: Same as @p input1.
+     * @param[in] input1   First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
+     * @param[in] input2   Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output   Output tensor info. Data types supported: Same as @p input1.
      * @param[in] policy   Policy to use to handle overflow.
      * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
      *
@@ -159,7 +150,6 @@
     // Inherited methods overridden:
     std::string name() override;
     std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) override;
-    Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override;
     CLBuildOptions generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override;
     std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output) override;
 
@@ -179,28 +169,28 @@
     /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel
      *
      * @param[in] op       Arithmetic operation to be executed.
-     * @param[in] input1   First tensor input. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/QSYMM16/F16/U32/S32/F32.
-     * @param[in] input2   Second tensor input. Data types supported: Same as @p input1.
-     * @param[in] output   Output tensor. Data types supported: Same as @p input1.
+     * @param[in] input1   First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
+     * @param[in] input2   Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output   Output tensor info. Data types supported: Same as @p input1.
      * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel
      *
      * @param[in] compile_context The compile context to be used.
      * @param[in] op              Arithmetic operation to be executed.
-     * @param[in] input1          First tensor input. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/QSYMM16/F16/U32/S32/F32.
-     * @param[in] input2          Second tensor input. Data types supported: Same as @p input1.
-     * @param[in] output          Output tensor. Data types supported: Same as @p input1.
+     * @param[in] input1          First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
+     * @param[in] input2          Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output          Output tensor info. Data types supported: Same as @p input1.
      * @param[in] act_info        (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(const CLCompileContext &compile_context, ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output,
+    void configure(const CLCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output,
                    const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel
      *
      * @param[in] op       Arithmetic operation to be executed.
-     * @param[in] input1   First tensor input info. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/QSYMM16/F16/U32/S32/F32.
+     * @param[in] input1   First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
      * @param[in] input2   Second tensor input info. Data types supported: Same as @p input1.
      * @param[in] output   Output tensor info. Data types supported: Same as @p input1.
      * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
@@ -213,7 +203,6 @@
     // Inherited methods overridden:
     std::string name() override;
     std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) override;
-    Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override;
     CLBuildOptions generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override;
     std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output) override;
 
diff --git a/arm_compute/core/CL/kernels/CLErodeKernel.h b/arm_compute/core/CL/kernels/CLErodeKernel.h
index cbc7481..620201d 100644
--- a/arm_compute/core/CL/kernels/CLErodeKernel.h
+++ b/arm_compute/core/CL/kernels/CLErodeKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h b/arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h
index a8da124..a196c8c 100644
--- a/arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h
+++ b/arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h b/arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h
index e3f5346..d6d6067 100644
--- a/arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h
+++ b/arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLFFTScaleKernel.h b/arm_compute/core/CL/kernels/CLFFTScaleKernel.h
index d0d2b76..c6dd176 100644
--- a/arm_compute/core/CL/kernels/CLFFTScaleKernel.h
+++ b/arm_compute/core/CL/kernels/CLFFTScaleKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLFastCornersKernel.h b/arm_compute/core/CL/kernels/CLFastCornersKernel.h
index 1a0d4e3..5d0da7d 100644
--- a/arm_compute/core/CL/kernels/CLFastCornersKernel.h
+++ b/arm_compute/core/CL/kernels/CLFastCornersKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLFillBorderKernel.h b/arm_compute/core/CL/kernels/CLFillBorderKernel.h
index d00ea55..5323af4 100644
--- a/arm_compute/core/CL/kernels/CLFillBorderKernel.h
+++ b/arm_compute/core/CL/kernels/CLFillBorderKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -51,6 +51,15 @@
 
     /** Initialise the kernel's input, output and border mode.
      *
+     * @param[in]     compile_context       The compile context to be used.
+     * @param[in,out] tensor                Tensor to process Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32.
+     * @param[in]     border_size           Size of the border to fill in elements.
+     * @param[in]     border_mode           Border mode to use for the convolution.
+     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
+     */
+    void configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+    /** Initialise the kernel's input, output and border mode.
+     *
      * @param[in,out] tensor                Tensor to process Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32.
      * @param[in]     border_size           Size of the border to fill in elements.
      * @param[in]     border_mode           Border mode to use for the convolution.
@@ -65,7 +74,7 @@
      * @param[in]     border_mode           Border mode to use for the convolution.
      * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
      */
-    void configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+    void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
 
     /** Function to set the constant value on fill border kernel depending on type.
      *
@@ -76,6 +85,7 @@
     void set_constant_border(unsigned int idx, const PixelValue &constant_border_value);
 
     // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
     void run(const Window &window, cl::CommandQueue &queue) override;
     bool is_parallelisable() const override;
 
diff --git a/arm_compute/core/CL/kernels/CLFlattenLayerKernel.h b/arm_compute/core/CL/kernels/CLFlattenLayerKernel.h
index ab009e1..4df0b33 100644
--- a/arm_compute/core/CL/kernels/CLFlattenLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLFlattenLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLFloorKernel.h b/arm_compute/core/CL/kernels/CLFloorKernel.h
index 4d1ed78..3b1d3f1 100644
--- a/arm_compute/core/CL/kernels/CLFloorKernel.h
+++ b/arm_compute/core/CL/kernels/CLFloorKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h b/arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h
index 2fe6b22..3ec251c 100644
--- a/arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h
+++ b/arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h
index 15fd208..2d5e4a3 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h
index 43526b7..f2eb447 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h
index 1aba6c0..a229514 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
index bc982c6..1d3b311 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
index 583b388..e3f88c1 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
index 1e9fde8..0b3f23d 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
index 766ef9a..767d792 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
index 6f58150..16990c5 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
index 0c237be..ef962d8 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
index cb3e12e..ca13b2f 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h
index 857b1c7..6066e2a 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -48,7 +48,7 @@
 
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  input  Input tensor. Data type supported: S8
+     * @param[in]  input  Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
      * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
      * @param[in]  info   Kernel metadata:
      *                    - k            Number of matrix columns/rows depending on the type of reduction.
@@ -60,7 +60,7 @@
     /** Initialise the kernel's input and output.
      *
      * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor. Data type supported: S8
+     * @param[in]  input           Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
      * @param[out] output          Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
      * @param[in]  info            Kernel metadata:
      *                             - k            Number of matrix columns/rows depending on the type of reduction.
@@ -85,7 +85,7 @@
 public:
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  mtx_a          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  mtx_a          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
      * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
      * @param[in]  info           Kernel metadata:
      *                            - k            Number of matrix columns/rows depending on the type of reduction.
@@ -97,7 +97,7 @@
     /** Initialise the kernel's input and output.
      *
      * @param[in]  compile_context The compile context to be used.
-     * @param[in]  mtx_a           Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  mtx_a           Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
      * @param[out] vector_sum_row  Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
      * @param[in]  info            Kernel metadata:
      *                             - k            Number of matrix columns/rows depending on the type of reduction.
@@ -108,7 +108,7 @@
     void configure(const CLCompileContext &compile_context, const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override;
     /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixAReductionKernel
      *
-     * @param[in] mtx_a          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in] mtx_a          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
      * @param[in] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
      * @param[in] info           Kernel metadata:
      *                           - k            Number of matrix columns/rows depending on the type of reduction.
@@ -134,7 +134,7 @@
 public:
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  mtx_b          Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  mtx_b          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
      * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
      * @param[in]  info           Kernel metadata:
      *                            - k            Number of matrix columns/rows depending on the type of reduction.
@@ -146,7 +146,7 @@
     /** Initialise the kernel's input and output.
      *
      * @param[in]  compile_context The compile context to be used.
-     * @param[in]  mtx_b           Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  mtx_b           Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
      * @param[out] vector_sum_col  Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
      * @param[in]  info            Kernel metadata:
      *                             - k            Number of matrix columns/rows depending on the type of reduction.
@@ -157,7 +157,7 @@
     void configure(const CLCompileContext &compile_context, const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override;
     /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixBReductionKernel
      *
-     * @param[in] mtx_b          Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in] mtx_b          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
      * @param[in] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
      * @param[in] info           Kernel metadata:
      *                           - k            Number of matrix columns/rows depending on the type of reduction.
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h
deleted file mode 100644
index df2f6f4..0000000
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMMATRIXACCUMULATEBIASESKERNEL_H
-#define ARM_COMPUTE_CLGEMMMATRIXACCUMULATEBIASESKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-/** Interface to add a bias to each row of the input tensor
- *
- */
-class CLGEMMMatrixAccumulateBiasesKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLGEMMMatrixAccumulateBiasesKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMMatrixAccumulateBiasesKernel(const CLGEMMMatrixAccumulateBiasesKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMMatrixAccumulateBiasesKernel &operator=(const CLGEMMMatrixAccumulateBiasesKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMMatrixAccumulateBiasesKernel(CLGEMMMatrixAccumulateBiasesKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMMatrixAccumulateBiasesKernel &operator=(CLGEMMMatrixAccumulateBiasesKernel &&) = default;
-    /** Set the accumulate buffer and the biases of the kernel.
-     *
-     * @param[in, out] accum  The accumulate tensor to convert. Data types supported: F16/F32
-     * @param[in]      biases The shared biases tensor to append. It must be 1D tensor. Data types supported: Same as @p input
-     */
-    void configure(ICLTensor *accum, const ICLTensor *biases);
-    /** Set the accumulate buffer and the biases of the kernel.
-     *
-     * @param[in]      compile_context The compile context to be used.
-     * @param[in, out] accum           The accumulate tensor to convert. Data types supported: F16/F32
-     * @param[in]      biases          The shared biases tensor to append. It must be 1D tensor. Data types supported: Same as @p input
-     */
-    void configure(const CLCompileContext &compile_context, ICLTensor *accum, const ICLTensor *biases);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixAccumulateBiasesKernel
-     *
-     * @param[in] accum      The accumulate tensor to convert. Data types supported: F16/F32
-     * @param[in] biases     The shared biases tensor to append. It must be 1D tensor. Data types supported: Same as @p input
-     * @param[in] gpu_target GPU target
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *accum, const ITensorInfo *biases, GPUTarget gpu_target);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    ICLTensor       *_accum;
-    const ICLTensor *_biases;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMMATRIXACCUMULATEBIASESKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
index 6085b34..4abd60c 100644
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h
index c711a3d..006b2bf 100644
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h
index ee8e57f..9626457 100644
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -55,19 +55,30 @@
      *       Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the
      *       multiplications. i.e. float c = (half)a * (half)b
      *
-     * @param[in]  input0    Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32. The number of dimensions for the LHS matrix must be less or equal than 4
+     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
+     *       Reading from the OpenCL image object can increase the performance. However, since the OpenCL image object is created importing the OpenCL buffer,
+     *       the following conditions are required:
+     *       -# rhs_info.n0 can only be 4, 8 and 16
+     *       -# rhs_info.k0 can only be 4, 8 and 16
+     *       -# Data type can only be F32
+     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+     *       -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement
+     *       -# input1 width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+     *       -# input1 (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+     *
+     * @param[in]  input0    Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true). The number of dimensions for the LHS matrix must be less or equal than 4
      * @param[in]  input1    Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3
      * @param[in]  input2    Input tensor containing the bias matrix. Data type supported: same as @p input0.
      * @param[out] output    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
      * @param[in]  alpha     Weight of the matrix product
      * @param[in]  beta      Weight of the matrix bias
-     * @param[in]  lhs_info  LHS matrix information used for reshaping the input0 tensor.  Only the following values are supported:
+     * @param[in]  lhs_info  LHS matrix information used for reshaping the input0 tensor. Only the following values are supported:
      *                       lhs_info.m0: 2,3,4,5,6,7,8
      *                       lhs_info.k0: 2,3,4,8,16
      *                       lhs_info.transpose: false
      * @param[in]  rhs_info  RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
-     *                       rhs_info.n0: 2,3,4,8,16
-     *                       rhs_info.k0: 2,3,4,8,16
+     *                       rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
+     *                       rhs_info.k0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
      *                       rhs_info.transpose: true
      * @param[in]  gemm_info GEMM information used to retrieve the original dimensions of the input matrices
      *
@@ -82,8 +93,19 @@
      *       Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the
      *       multiplications. i.e. float c = (half)a * (half)b
      *
+     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
+     *       Reading from the OpenCL image object can increase the performance. However, since the OpenCL image object is created importing the OpenCL buffer,
+     *       the following conditions are required:
+     *       -# rhs_info.n0 can only be 4, 8 and 16
+     *       -# rhs_info.k0 can only be 4, 8 and 16
+     *       -# Data type can only be F32
+     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+     *       -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement
+     *       -# input1 width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+     *       -# input1 (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+     *
      * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input0          Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32. The number of dimensions for the LHS matrix must be less or equal than 4
+     * @param[in]  input0          Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true). The number of dimensions for the LHS matrix must be less or equal than 4
      * @param[in]  input1          Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3
      * @param[in]  input2          Input tensor containing the bias matrix. Data type supported: same as @p input0.
      * @param[out] output          Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
@@ -94,8 +116,8 @@
      *                             lhs_info.k0: 2,3,4,8,16
      *                             lhs_info.transpose: false
      * @param[in]  rhs_info        RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
-     *                             rhs_info.n0: 2,3,4,8,16
-     *                             rhs_info.k0: 2,3,4,8,16
+     *                             rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
+     *                             rhs_info.k0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
      *                             rhs_info.transpose: true
      * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
      *
@@ -107,7 +129,22 @@
                    const GEMMKernelInfo    &gemm_info);
     /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyReshapedKernel
      *
-     * @param[in] input0    Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32. The number of dimensions for the LHS matrix must be less or equal than 4
+     * @note The F16 computation also supports mixed precision through the gemm_info.fp_mixed_precision flag.
+     *       Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the
+     *       multiplications. i.e. float c = (half)a * (half)b
+     *
+     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
+     *       Reading from the OpenCL image object can increase the performance. However, since the OpenCL image object is created importing the OpenCL buffer,
+     *       the following conditions are required:
+     *       -# rhs_info.n0 can only be 4, 8 and 16
+     *       -# rhs_info.k0 can only be 4, 8 and 16
+     *       -# Data type can only be F32
+     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+     *       -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement
+     *       -# input1 width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+     *       -# input1 (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+     *
+     * @param[in] input0    Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true). The number of dimensions for the LHS matrix must be less or equal than 4
      * @param[in] input1    Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3
      * @param[in] input2    Input tensor info containing the bias matrix. Data type supported: same as @p input0.
      * @param[in] output    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
@@ -118,8 +155,8 @@
      *                      lhs_info.k0: 2,3,4,8,16
      *                      lhs_info.transpose: false
      * @param[in] rhs_info  RHS matrix information used for reshaping the input1 tensor.  Only the following values are supported:
-     *                      rhs_info.n0: 2,3,4,8,16
-     *                      rhs_info.k0: 2,3,4,8,16
+     *                      rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
+     *                      rhs_info.k0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
      *                      rhs_info.transpose: true
      * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
      *
@@ -141,10 +178,11 @@
     ICLTensor       *_output;
     bool             _slide_matrix_b;
     bool             _reinterpret_output_as_3d;
-    unsigned int     _k;
     bool             _use_dummy_work_items;
     bool             _add_bias;
     bool             _broadcast_bias;
+    bool             _export_to_cl_image;
+    unsigned int     _k;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H*/
\ No newline at end of file
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h
index f7d314a..fc21f2a 100644
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -51,7 +51,19 @@
     CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &operator=(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &&) = default;
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  input0    Input tensor containing the LHS matrix. Data type supported: F16/F32. The number of dimensions for the LHS matrix must be less or equal than 4.
+     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
+     *       Reading from the OpenCL image object can increase the performance. However, since the OpenCL image object is created importing the OpenCL buffer,
+     *       the following conditions are required:
+     *       -# rhs_info.n0 can only be 4, 8 and 16
+     *       -# rhs_info.k0 can only be 4, 8 and 16
+     *       -# Data type can only be F32
+     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+     *       -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement
+     *       -# input1 width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+     *       -# input1 (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+     *
+     * @param[in]  input0    Input tensor containing the LHS matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true).
+     *                       The number of dimensions for the LHS matrix must be less or equal than 4.
      * @param[in]  input1    Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3.
      * @param[in]  input2    Input tensor containing the bias matrix. Data type supported: same as @p input0.
      * @param[out] output    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
@@ -70,8 +82,20 @@
                    const GEMMKernelInfo    &gemm_info);
     /** Initialise the kernel's input and output.
      *
+     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
+     *       Reading from the OpenCL image object can increase the performance. However, since the OpenCL image object is created importing the OpenCL buffer,
+     *       the following conditions are required:
+     *       -# rhs_info.n0 can only be 4, 8 and 16
+     *       -# rhs_info.k0 can only be 4, 8 and 16
+     *       -# Data type can only be F32
+     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+     *       -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement
+     *       -# input1 width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+     *       -# input1 (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+     *
      * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input0          Input tensor containing the LHS matrix. Data type supported: F16/F32. The number of dimensions for the LHS matrix must be less or equal than 4.
+     * @param[in]  input0          Input tensor containing the LHS matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true).
+     *                             The number of dimensions for the LHS matrix must be less or equal than 4.
      * @param[in]  input1          Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3.
      * @param[in]  input2          Input tensor containing the bias matrix. Data type supported: same as @p input0.
      * @param[out] output          Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
@@ -91,7 +115,19 @@
                    const GEMMKernelInfo    &gemm_info);
     /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
      *
-     * @param[in] input0    Input tensor info for the LHS matrix. Data type supported: F16/F32. The number of dimensions for the LHS matrix must be less or equal than 4.
+     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
+     *       Reading from the OpenCL image object can increase the performance. However, since the OpenCL image object is created importing the OpenCL buffer,
+     *       the following conditions are required:
+     *       -# rhs_info.n0 can only be 4, 8 and 16
+     *       -# rhs_info.k0 can only be 4, 8 and 16
+     *       -# Data type can only be F32
+     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+     *       -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement
+     *       -# input1 width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+     *       -# input1 (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+     *
+     * @param[in] input0    Input tensor info for the LHS matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true).
+     *                      The number of dimensions for the LHS matrix must be less or equal than 4.
      * @param[in] input1    Input tensor info for the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less or equal than 3.
      * @param[in] input2    Input tensor info containing the bias matrix. Data type supported: same as @p input0.
      * @param[in] output    Output tensor info. Data type supported: same as @p input0
@@ -125,6 +161,7 @@
     bool             _use_dummy_work_items;
     bool             _add_bias;
     bool             _broadcast_bias;
+    bool             _export_to_cl_image;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H*/
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h
index 6d70b4b..95ed87d 100644
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h b/arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h
index fe77fcb..0f74cb8 100644
--- a/arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h b/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h
index 0e6352b..5f953dd 100644
--- a/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -48,12 +48,23 @@
     CLGEMMReshapeRHSMatrixKernel &operator=(CLGEMMReshapeRHSMatrixKernel &&) = default;
     /** Initialise the kernel's input and output.
      *
+     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will guarantee the OpenCL pitch alignment for the output tensor,
+     *       required to create an OpenCL image object from a buffer in @ref CLGEMMMatrixMultiplyReshapedKernel and in @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
+     *       Since the OpenCL image object is created by importing the OpenCL buffer, the following conditions are required:
+     *       -# rhs_info.n0 can only be 4, 8 and 16
+     *       -# rhs_info.k0 can only be 4, 8 and 16
+     *       -# Data type can only be F32
+     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+     *       -# output width should be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+     *       -# output (height * depth) should be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+     *       -# The output tensor should be only consumed by @ref CLGEMMMatrixMultiplyReshapedKernel or @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
+     *
      * @param[in]  input    Input tensor. Data types supported: All
      * @param[out] output   Output tensor. Data type supported: same as @p input
      * @param[in]  rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary
      *                      information to reshape the input tensor. Only the following values are supported:
-     *                      rhs_info.n0: 2,3,4,8,16
-     *                      rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false)
+     *                      rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
+     *                      rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false), (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
      *                      rhs_info.h0: greater than 0
      *                      rhs_info.transpose: true, false
      *                      rhs_info.interleave: true, false
@@ -61,13 +72,24 @@
     void configure(const ICLTensor *input, ICLTensor *output, const GEMMRHSMatrixInfo &rhs_info);
     /** Initialise the kernel's input and output.
      *
+     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will guarantee the OpenCL pitch alignment for the output tensor,
+     *       required to create an OpenCL image object from a buffer in @ref CLGEMMMatrixMultiplyReshapedKernel and in @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
+     *       Since the OpenCL image object is created by importing the OpenCL buffer, the following conditions are required:
+     *       -# rhs_info.n0 can only be 4, 8 and 16
+     *       -# rhs_info.k0 can only be 4, 8 and 16
+     *       -# Data type can only be F32
+     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+     *       -# output width should be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+     *       -# output (height * depth) should be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+     *       -# The output tensor should be only consumed by @ref CLGEMMMatrixMultiplyReshapedKernel or @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
+     *
      * @param[in]  compile_context The compile context to be used.
      * @param[in]  input           Input tensor. Data types supported: All
      * @param[out] output          Output tensor. Data type supported: same as @p input
      * @param[in]  rhs_info        RHS matrix information to be used for reshaping. This object contains all the necessary
      *                             information to reshape the input tensor. Only the following values are supported:
-     *                             rhs_info.n0: 2,3,4,8,16
-     *                             rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false)
+     *                             rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
+     *                             rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false), (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
      *                             rhs_info.h0: greater than 0
      *                             rhs_info.transpose: true, false
      *                             rhs_info.interleave: true, false
@@ -75,12 +97,23 @@
     void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMRHSMatrixInfo &rhs_info);
     /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMReshapeRHSMatrixKernel
      *
+     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will guarantee the OpenCL pitch alignment for the output tensor,
+     *       required to create an OpenCL image object from a buffer in @ref CLGEMMMatrixMultiplyReshapedKernel and in @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
+     *       Since the OpenCL image object is created by importing the OpenCL buffer, the following conditions are required:
+     *       -# rhs_info.n0 can only be 4, 8 and 16
+     *       -# rhs_info.k0 can only be 4, 8 and 16
+     *       -# Data type can only be F32
+     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+     *       -# output width should be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+     *       -# output (height * depth) should be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+     *       -# The output tensor should be only consumed by @ref CLGEMMMatrixMultiplyReshapedKernel or @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
+     *
      * @param[in] input    Input tensor info. Data types supported: All
      * @param[in] output   Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
      * @param[in] rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary
      *                     information to reshape the input tensor. Only the following values are supported:
-     *                     rhs_info.n0: 2,3,4,8,16
-     *                     rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false)
+     *                     rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
+     *                     rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false), (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
      *                     rhs_info.h0: greater than 0
      *                     rhs_info.transpose: true, false
      *                     rhs_info.interleave: true, false
diff --git a/arm_compute/core/CL/kernels/CLGatherKernel.h b/arm_compute/core/CL/kernels/CLGatherKernel.h
index b753953..c8a9632 100644
--- a/arm_compute/core/CL/kernels/CLGatherKernel.h
+++ b/arm_compute/core/CL/kernels/CLGatherKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h b/arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h
index 6a9d3ea..a783527 100644
--- a/arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h
+++ b/arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h b/arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h
index d8730e0..e8c2268 100644
--- a/arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h
+++ b/arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h b/arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h
index 34cd062..36e095d 100644
--- a/arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h
+++ b/arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h b/arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h
index 46dc16d..9dfe4a4 100644
--- a/arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h b/arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h
index 0469505..c001aa2 100644
--- a/arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h
+++ b/arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLHOGDetectorKernel.h b/arm_compute/core/CL/kernels/CLHOGDetectorKernel.h
index 681c212..dc9bba8 100644
--- a/arm_compute/core/CL/kernels/CLHOGDetectorKernel.h
+++ b/arm_compute/core/CL/kernels/CLHOGDetectorKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLHarrisCornersKernel.h b/arm_compute/core/CL/kernels/CLHarrisCornersKernel.h
index a13119b..38a2f04 100644
--- a/arm_compute/core/CL/kernels/CLHarrisCornersKernel.h
+++ b/arm_compute/core/CL/kernels/CLHarrisCornersKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h b/arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h
index 524e5ea..4fa2b40 100644
--- a/arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,8 +30,6 @@
 
 namespace arm_compute
 {
-class ICLTensor;
-
 /** Interface for the height concatenate kernel.
  *  The input tensor will be concatenated into the output tensor.
  */
@@ -52,21 +50,13 @@
     ~CLHeightConcatenateLayerKernel() = default;
     /** Initialise the kernel's inputs and output
      *
-     * @param[in]  input         Input tensor. Data types supported: All.
-     * @param[in]  height_offset The starting offset on the Y axis for the output tensor.
-     * @param[out] output        Output tensor. Data types supported: Same as @p input.
-     *
-     */
-    void configure(const ICLTensor *input, unsigned int height_offset, ICLTensor *output);
-    /** Initialise the kernel's inputs and output
-     *
      * @param[in]  compile_context The compile context to be used.
      * @param[in]  input           Input tensor. Data types supported: All.
      * @param[in]  height_offset   The starting offset on the Y axis for the output tensor.
      * @param[out] output          Output tensor. Data types supported: Same as @p input.
      *
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int height_offset, ICLTensor *output);
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int height_offset, ITensorInfo *output);
     /**  Static function to check if given info will lead to a valid configuration of @ref CLHeightConcatenateLayerKernel
      *
      * @param[in] input         Input tensor info. Data types supported: All.
@@ -78,13 +68,11 @@
     static Status validate(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output);
 
     // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 
 private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    unsigned int     _height_offset;
-    unsigned int     _num_elems_processed_per_iteration;
+    unsigned int _height_offset;
+    unsigned int _num_elems_processed_per_iteration;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CLHEIGHTCONCATENATELAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLHistogramKernel.h b/arm_compute/core/CL/kernels/CLHistogramKernel.h
index 9cd3747..7cb79db 100644
--- a/arm_compute/core/CL/kernels/CLHistogramKernel.h
+++ b/arm_compute/core/CL/kernels/CLHistogramKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLIm2ColKernel.h b/arm_compute/core/CL/kernels/CLIm2ColKernel.h
index 61f2a3d..7b7bd03 100644
--- a/arm_compute/core/CL/kernels/CLIm2ColKernel.h
+++ b/arm_compute/core/CL/kernels/CLIm2ColKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernel.h b/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernel.h
index 014dce1..a3fdd3c 100644
--- a/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLIntegralImageKernel.h b/arm_compute/core/CL/kernels/CLIntegralImageKernel.h
index 6b6076a..cef699a 100644
--- a/arm_compute/core/CL/kernels/CLIntegralImageKernel.h
+++ b/arm_compute/core/CL/kernels/CLIntegralImageKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h b/arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h
index 169910b..55fe563 100644
--- a/arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLLKTrackerKernel.h b/arm_compute/core/CL/kernels/CLLKTrackerKernel.h
index f94602c..fdc2ef8 100644
--- a/arm_compute/core/CL/kernels/CLLKTrackerKernel.h
+++ b/arm_compute/core/CL/kernels/CLLKTrackerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h
index e68160f..d5653f8 100644
--- a/arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h
+++ b/arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h b/arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h
index e0de3e7..a741b17 100644
--- a/arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h
+++ b/arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -51,9 +51,9 @@
      *
      * @note At least one of output1 or output2 must be set.
      *
-     * @param[in]  gx         The input gradient X tensor. Data types supported: S16.
-     * @param[in]  gy         The input gradient Y tensor. Data types supported: S16.
-     * @param[out] magnitude  (Optional) The output tensor - Magnitude. Data types supported: S16.
+     * @param[in]  gx         The input gradient X tensor. Data types supported: S16/S32.
+     * @param[in]  gy         The input gradient Y tensor. Data types supported: S16/S32.
+     * @param[out] magnitude  (Optional) The output tensor - Magnitude. Data types supported: S16/S32.
      * @param[out] phase      (Optional) The output tensor - Phase. Data types supported: U8.
      * @param[in]  mag_type   (Optional) Magnitude calculation type. Default: L2NORM.
      * @param[in]  phase_type (Optional) Phase calculation type. Default: SIGNED.
@@ -65,9 +65,9 @@
      * @note At least one of output1 or output2 must be set.
      *
      * @param[in]  compile_context The compile context to be used.
-     * @param[in]  gx              The input gradient X tensor. Data types supported: S16.
-     * @param[in]  gy              The input gradient Y tensor. Data types supported: S16.
-     * @param[out] magnitude       (Optional) The output tensor - Magnitude. Data types supported: S16.
+     * @param[in]  gx              The input gradient X tensor. Data types supported: S16/S32.
+     * @param[in]  gy              The input gradient Y tensor. Data types supported: S16/S32.
+     * @param[out] magnitude       (Optional) The output tensor - Magnitude. Data types supported: S16/S32.
      * @param[out] phase           (Optional) The output tensor - Phase. Data types supported: U8.
      * @param[in]  mag_type        (Optional) Magnitude calculation type. Default: L2NORM.
      * @param[in]  phase_type      (Optional) Phase calculation type. Default: SIGNED.
diff --git a/arm_compute/core/CL/kernels/CLMaxUnpoolingLayerKernel.h b/arm_compute/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
new file mode 100644
index 0000000..9d51f6b
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLMAXUNPOOLINGLAYERKERNEL_H
+#define ARM_COMPUTE_CLMAXUNPOOLINGLAYERKERNEL_H
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the pooling layer kernel */
+class CLMaxUnpoolingLayerKernel : public ICLKernel
+{
+public:
+    /** Default constructor */
+    CLMaxUnpoolingLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLMaxUnpoolingLayerKernel(const CLMaxUnpoolingLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLMaxUnpoolingLayerKernel &operator=(const CLMaxUnpoolingLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CLMaxUnpoolingLayerKernel(CLMaxUnpoolingLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CLMaxUnpoolingLayerKernel &operator=(CLMaxUnpoolingLayerKernel &&) = default;
+    /** Default destructor */
+    ~CLMaxUnpoolingLayerKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @note Output shape must be equal to the shape of the original input to pool.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  indices         Tensor containing the offset to store the input elements in the output tensor.
+     *                             @ref CLPoolingLayerKernel with indices should precede this function in order to
+     *                             properly reconstruct the output tensor.
+     *                             The tensor shape of this tensor has to be equal to the input tensor shape. Data type supported: U32.
+     * @param[out] output          Destination tensor. Data types supported: Same as @p input.
+     * @param[in]  pool_info       Contains pooling operation information described in @ref PoolingLayerInfo.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLMaxUnpoolingLayerKernel
+     *
+     * @param[in] input     Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] output    Destination tensor info. Data types supported: Same as @p input.
+     * @param[in] indices   TensorInfo associated to the tensor containing the offset to store the input elements in the output tensor.
+     *                      @ref CLPoolingLayerKernel with indices should precede this function in order to
+     *                      properly reconstruct the output tensor.
+     *                      The tensor shape of this tensor has to be equal to the input tensor shape. Data type supported: U32.
+     * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info);
+
+    // Inherited methods overridden
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+    const ICLTensor *_indices;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLMAXUNPOOLINGLAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLMeanStdDevKernel.h b/arm_compute/core/CL/kernels/CLMeanStdDevKernel.h
index 96b4c4e..2a5a5f2 100644
--- a/arm_compute/core/CL/kernels/CLMeanStdDevKernel.h
+++ b/arm_compute/core/CL/kernels/CLMeanStdDevKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h b/arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h
index ff0c96e..ff5e9ab 100644
--- a/arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h
+++ b/arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLMedian3x3Kernel.h b/arm_compute/core/CL/kernels/CLMedian3x3Kernel.h
index c68ab07..ccb4753 100644
--- a/arm_compute/core/CL/kernels/CLMedian3x3Kernel.h
+++ b/arm_compute/core/CL/kernels/CLMedian3x3Kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLMemsetKernel.h b/arm_compute/core/CL/kernels/CLMemsetKernel.h
index 430bc1d..5bda480 100644
--- a/arm_compute/core/CL/kernels/CLMemsetKernel.h
+++ b/arm_compute/core/CL/kernels/CLMemsetKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h b/arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h
index 5f9685f..a693cfd 100644
--- a/arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h b/arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h
index afb134f..fbcf697 100644
--- a/arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h
+++ b/arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h b/arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h
index 1f33735..cee6448 100644
--- a/arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h
+++ b/arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h b/arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h
index a256bc7..d1bba4f 100644
--- a/arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h
+++ b/arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h b/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h
index 2511818..6233d28 100644
--- a/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h b/arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h
index d247e1f..2e2e60d 100644
--- a/arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLPadLayerKernel.h b/arm_compute/core/CL/kernels/CLPadLayerKernel.h
index 166c202..5bf5841 100644
--- a/arm_compute/core/CL/kernels/CLPadLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLPadLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,7 +25,6 @@
 #define ARM_COMPUTE_CLPADLAYERKERNEL_H
 
 #include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/CL/ICLTensor.h"
 
 namespace arm_compute
 {
@@ -49,7 +48,7 @@
     ~CLPadLayerKernel() = default;
     /** Set the input and output tensor.
      *
-     * @param[in]  input          Source tensor. Data types supported: U8, S8, QASYMM8, QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32.
+     * @param[in]  input          Source tensor. Data types supported: All.
      * @param[out] output         Output tensor. Data type supported: same as @p input
      * @param[in]  padding        The padding for each spatial dimension of the input tensor. The pair padding[i]
      *                            specifies the front and the end padding in the i-th dimension.
@@ -73,7 +72,7 @@
                    PaddingMode mode = PaddingMode::CONSTANT);
     /** Static function to check if given info will lead to a valid configuration of @ref CLPadLayerKernel
      *
-     * @param[in] input          Source tensor info. Data types supported: U8, S8, QASYMM8, QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32.
+     * @param[in] input          Source tensor info. Data types supported: All.
      * @param[in] output         Output tensor info. Data type supported: same as @p input
      * @param[in] padding        The padding for each spatial dimension of the input tensor. The pair padding[i]
      *                           specifies the front and the end padding in the i-th dimension.
diff --git a/arm_compute/core/CL/kernels/CLPermuteKernel.h b/arm_compute/core/CL/kernels/CLPermuteKernel.h
index 1a9240e..bb841b1 100644
--- a/arm_compute/core/CL/kernels/CLPermuteKernel.h
+++ b/arm_compute/core/CL/kernels/CLPermuteKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h b/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
index 52a09d9..6b5bd11 100644
--- a/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
+++ b/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -48,52 +48,78 @@
     CLPixelWiseMultiplicationKernel &operator=(CLPixelWiseMultiplicationKernel &&) = default;
     /** Initialise the kernel's input, output and border mode.
      *
-     * @param[in]  input1          An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
-     * @param[in]  input2          An input tensor. Data types supported: same as @p input1.
-     * @param[out] output          The output tensor, Data types supported:
-     *                             - U8, only if both input are U8
-     *                             - QASYMM8, only if both inputs are QASYMM8
-     *                             - QASYMM8_SIGNED, only if both inputs are QASYMM8_SIGNED
-     *                             - S16
-     *                             - QSYMM16, only if both inputs are QSYMM16
-     *                             - S32, only if both inputs are QSYMM16
-     *                             - F16
-     *                             - F32
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)                         -> U8
+     *   - (U8,U8)                         -> S16
+     *   - (U8,S16)                        -> S16
+     *   - (S16,U8)                        -> S16
+     *   - (S16,S16)                       -> S16
+     *   - (F16,F16)                       -> F16
+     *   - (F32,F32)                       -> F32
+     *   - (QASYMM8,QASYMM8)               -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16)               -> QSYMM16
+     *   - (QSYMM16,QSYMM16)               -> S32
+     *
+     * @param[in]  input1          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     * @param[in]  input2          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     * @param[out] output          The output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
      * @param[in]  scale           Scale to apply after multiplication.
      *                             Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
      * @param[in]  overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
      * @param[in]  rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
      * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
+    void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale,
                    ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Initialise the kernel's input, output and border mode.
      *
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)                         -> U8
+     *   - (U8,U8)                         -> S16
+     *   - (U8,S16)                        -> S16
+     *   - (S16,U8)                        -> S16
+     *   - (S16,S16)                       -> S16
+     *   - (F16,F16)                       -> F16
+     *   - (F32,F32)                       -> F32
+     *   - (QASYMM8,QASYMM8)               -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16)               -> QSYMM16
+     *   - (QSYMM16,QSYMM16)               -> S32
+     *
      * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input1          An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
-     * @param[in]  input2          An input tensor. Data types supported: same as @p input1.
-     * @param[out] output          The output tensor, Data types supported: same as @p input1. Note: U8 requires both inputs to be U8.
+     * @param[in]  input1          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     * @param[in]  input2          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     * @param[out] output          The output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
      * @param[in]  scale           Scale to apply after multiplication.
      *                             Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
      * @param[in]  overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
      * @param[in]  rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
      * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale,
                    ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref CLPixelWiseMultiplicationKernel
      *
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)                         -> U8
+     *   - (U8,U8)                         -> S16
+     *   - (U8,S16)                        -> S16
+     *   - (S16,U8)                        -> S16
+     *   - (S16,S16)                       -> S16
+     *   - (F16,F16)                       -> F16
+     *   - (F32,F32)                       -> F32
+     *   - (QASYMM8,QASYMM8)               -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16)               -> QSYMM16
+     *   - (QSYMM16,QSYMM16)               -> S32
+     *
      * @param[in] input1          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
-     * @param[in] input2          An input tensor info. Data types supported: same as @p input1.
-     * @param[in] output          The output tensor info, Data types supported:
-     *                            - U8, only if both input are U8
-     *                            - QASYMM8, only if both inputs are QASYMM8
-     *                            - QASYMM8_SIGNED, only if both inputs are QASYMM8_SIGNED
-     *                            - S16
-     *                            - QSYMM16, only if both inputs are QSYMM16
-     *                            - S32, only if both inputs are QSYMM16
-     *                            - F16
-     *                            - F32
+     * @param[in] input2          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     * @param[in] output          The output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
      * @param[in] scale           Scale to apply after multiplication.
      *                            Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
      * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
@@ -106,13 +132,13 @@
                            ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
     BorderSize border_size() const override;
 
 private:
-    const ICLTensor *_input1;
-    const ICLTensor *_input2;
-    ICLTensor       *_output;
+    const ITensorInfo *_input1;
+    const ITensorInfo *_input2;
+    ITensorInfo       *_output;
 };
 
 /** Interface for the complex pixelwise multiplication kernel. */
@@ -131,21 +157,21 @@
     CLComplexPixelWiseMultiplicationKernel &operator=(CLComplexPixelWiseMultiplicationKernel &&) = default;
     /** Initialise the kernel's input, output and border mode.
      *
-     * @param[in]  input1   An input tensor. Data types supported: F32. Number of channels supported: 2.
-     * @param[in]  input2   An input tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
-     * @param[out] output   The output tensor, Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+     * @param[in]  input1   An input tensor info. Data types supported: F32. Number of channels supported: 2.
+     * @param[in]  input2   An input tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+     * @param[out] output   The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
      * @param[in]  act_info (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Initialise the kernel's input, output and border mode.
      *
      * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input1          An input tensor. Data types supported: F32. Number of channels supported: 2.
-     * @param[in]  input2          An input tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
-     * @param[out] output          The output tensor, Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+     * @param[in]  input1          An input tensor info. Data types supported: F32. Number of channels supported: 2.
+     * @param[in]  input2          An input tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+     * @param[out] output          The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
      * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref CLComplexPixelWiseMultiplicationKernel
      *
      * @param[in] input1   An input tensor info. Data types supported: F32. Number of channels supported: 2.
@@ -158,13 +184,13 @@
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
     BorderSize border_size() const override;
 
 private:
-    const ICLTensor *_input1;
-    const ICLTensor *_input2;
-    ICLTensor       *_output;
+    const ITensorInfo *_input1;
+    const ITensorInfo *_input2;
+    ITensorInfo       *_output;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h
index 3957504..85585e4 100644
--- a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h b/arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h
index 5fd27d9..b4a69ac 100644
--- a/arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h b/arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h
index 2d47072..51c50bc 100644
--- a/arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h
+++ b/arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h b/arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h
index de30447..b0144bf 100644
--- a/arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h b/arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h
index 30bdbb1..6a0468d 100644
--- a/arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h b/arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h
index ea70a58..ee422e1 100644
--- a/arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLRangeKernel.h b/arm_compute/core/CL/kernels/CLRangeKernel.h
index fc8db98..b5c64b2 100644
--- a/arm_compute/core/CL/kernels/CLRangeKernel.h
+++ b/arm_compute/core/CL/kernels/CLRangeKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLReductionOperationKernel.h b/arm_compute/core/CL/kernels/CLReductionOperationKernel.h
index 0b0b4ae..2ecd1c9 100644
--- a/arm_compute/core/CL/kernels/CLReductionOperationKernel.h
+++ b/arm_compute/core/CL/kernels/CLReductionOperationKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLRemapKernel.h b/arm_compute/core/CL/kernels/CLRemapKernel.h
index f3d1511..fd261cd 100644
--- a/arm_compute/core/CL/kernels/CLRemapKernel.h
+++ b/arm_compute/core/CL/kernels/CLRemapKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLReorgLayerKernel.h b/arm_compute/core/CL/kernels/CLReorgLayerKernel.h
index 9c06485..e3edc9f 100644
--- a/arm_compute/core/CL/kernels/CLReorgLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLReorgLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,7 +47,7 @@
     CLReorgLayerKernel &operator=(CLReorgLayerKernel &&) = default;
     /** Initialize the kernel's input, output.
      *
-     * @param[in]  input  Source tensor. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32.
+     * @param[in]  input  Source tensor. Data types supported: All.
      * @param[out] output Destination tensor with tensor shape:
      *                    [width_input / stride, height_input / stride, channels_input * stride * stride, batch_size]. This means the output has
      *                    the same number of input elements. Data types supported: same as @p input.
@@ -58,7 +58,7 @@
     /** Initialize the kernel's input, output.
      *
      * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32.
+     * @param[in]  input           Source tensor. Data types supported: All.
      * @param[out] output          Destination tensor with tensor shape:
      *                             [width_input / stride, height_input / stride, channels_input * stride * stride, batch_size]. This means the output has
      *                             the same number of input elements. Data types supported: same as @p input.
diff --git a/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h b/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h
index 3ea7411..6e3f255 100644
--- a/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,31 +35,13 @@
 class CLReshapeLayerKernel : public ICLKernel
 {
 public:
-    /** Default constructor */
-    CLReshapeLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLReshapeLayerKernel(const CLReshapeLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLReshapeLayerKernel &operator=(const CLReshapeLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLReshapeLayerKernel(CLReshapeLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLReshapeLayerKernel &operator=(CLReshapeLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLReshapeLayerKernel() = default;
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input  Source tensor. Data type supported: All.
-     * @param[out] output Destination tensor. Data type supported: Same as @p input
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
     /** Set the input and output of the kernel
      *
      * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data type supported: All.
-     * @param[out] output          Destination tensor. Data type supported: Same as @p input
+     * @param[in]  input           Source tensor info. Data type supported: All.
+     * @param[out] output          Destination tensor info. Data type supported: Same as @p input
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLReshapeLayerKernel
      *
@@ -71,11 +53,7 @@
     static Status validate(const ITensorInfo *input, const ITensorInfo *output);
 
     // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;  /**< Source tensor */
-    ICLTensor       *_output; /**< Destination tensor */
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_CLRESHAPELAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLReverseKernel.h b/arm_compute/core/CL/kernels/CLReverseKernel.h
index e8f4507..17f1a4a 100644
--- a/arm_compute/core/CL/kernels/CLReverseKernel.h
+++ b/arm_compute/core/CL/kernels/CLReverseKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLScaleKernel.h b/arm_compute/core/CL/kernels/CLScaleKernel.h
index 328578d..79f7ed1 100644
--- a/arm_compute/core/CL/kernels/CLScaleKernel.h
+++ b/arm_compute/core/CL/kernels/CLScaleKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,7 +25,7 @@
 #define ARM_COMPUTE_CLSCALEKERNEL_H
 
 #include "arm_compute/core/CL/ICLSimple2DKernel.h"
-#include "arm_compute/core/Types.h"
+#include "arm_compute/core/KernelDescriptors.h"
 
 namespace arm_compute
 {
@@ -37,43 +37,32 @@
 public:
     /** Initialise the kernel's inputs, output and interpolation policy
      *
-     * @param[in]  input           Source tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
-     * @param[out] output          Destination tensor. Data types supported: Same as @p input
-     *                             All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
-     * @param[in]  policy          Interpolation type to use
-     * @param[in]  border_mode     Selected border mode.
-     * @param[in]  sampling_policy (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
-     * @param[in]  align_corners   (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
+     * @param[in]  input  Source tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
+     * @param[out] output Destination tensor. Data types supported: Same as @p input
+     *                    All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in]  info   @ref ScaleKernelInfo Kernel descriptor to be used to configure.
      */
-    void configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool align_corners = false);
+    void configure(const ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info);
     /** Initialise the kernel's inputs, output and interpolation policy
      *
      * @param[in]  compile_context The compile context to be used.
      * @param[in]  input           Source tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
      * @param[out] output          Destination tensor. Data types supported: Same as @p input
      *                             All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
-     * @param[in]  policy          Interpolation type to use
-     * @param[in]  border_mode     Selected border mode.
-     * @param[in]  sampling_policy (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
-     * @param[in]  align_corners   (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
+     * @param[in]  info            @ref ScaleKernelInfo Kernel descriptor to be used to configure.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode,
-                   SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool align_corners = false);
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLScaleKernel
      *
-     * @param[in] input           Source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
-     * @param[in] output          Destination tensor info. Data types supported: Same as @p input
-     *                            All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
-     * @param[in] policy          Interpolation type to use
-     * @param[in] border_mode     Selected border mode.
-     * @param[in] sampling_policy (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
-     * @param[in] align_corners   (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
+     * @param[in] input  Source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
+     * @param[in] output Destination tensor info. Data types supported: Same as @p input
+     *                   All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in] info   @ref ScaleKernelInfo Kernel descriptor to be used to validate
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy = SamplingPolicy::CENTER,
-                           bool align_corners = false);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ScaleKernelInfo &info);
     /** Input tensor accessor.
      *
      * @return Pointer to input tensor.
@@ -89,10 +78,16 @@
     BorderSize border_size() const override;
     void run(const Window &window, cl::CommandQueue &queue) override;
 
-public:
-    InterpolationPolicy _interpolationPolicy = InterpolationPolicy::BILINEAR;
-    DataLayout          _data_layout         = DataLayout::UNKNOWN;
-    bool                _align_corners       = false;
+    // Getter for interpolation policy
+    InterpolationPolicy get_interpolation_policy() const
+    {
+        return _interpolation_policy;
+    }
+
+private:
+    InterpolationPolicy _interpolation_policy = InterpolationPolicy::BILINEAR;
+    DataLayout          _data_layout          = DataLayout::UNKNOWN;
+    bool                _align_corners        = false;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_CLSCALEKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLScharr3x3Kernel.h b/arm_compute/core/CL/kernels/CLScharr3x3Kernel.h
index 209a150..1af56a7 100644
--- a/arm_compute/core/CL/kernels/CLScharr3x3Kernel.h
+++ b/arm_compute/core/CL/kernels/CLScharr3x3Kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLSelectKernel.h b/arm_compute/core/CL/kernels/CLSelectKernel.h
index 5cbd985..4015a27 100644
--- a/arm_compute/core/CL/kernels/CLSelectKernel.h
+++ b/arm_compute/core/CL/kernels/CLSelectKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLSobel3x3Kernel.h b/arm_compute/core/CL/kernels/CLSobel3x3Kernel.h
index 4240fe8..e247678 100644
--- a/arm_compute/core/CL/kernels/CLSobel3x3Kernel.h
+++ b/arm_compute/core/CL/kernels/CLSobel3x3Kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLSobel5x5Kernel.h b/arm_compute/core/CL/kernels/CLSobel5x5Kernel.h
index ef30f0e..82831ed 100644
--- a/arm_compute/core/CL/kernels/CLSobel5x5Kernel.h
+++ b/arm_compute/core/CL/kernels/CLSobel5x5Kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLSobel7x7Kernel.h b/arm_compute/core/CL/kernels/CLSobel7x7Kernel.h
index 4eda5a4..d55993d 100644
--- a/arm_compute/core/CL/kernels/CLSobel7x7Kernel.h
+++ b/arm_compute/core/CL/kernels/CLSobel7x7Kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h b/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h
index b174f49..f8c1019 100644
--- a/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,93 +27,10 @@
 #include "arm_compute/core/CL/ICLSimple3DKernel.h"
 #include "arm_compute/core/KernelDescriptors.h"
 
-#include <tuple>
-
 namespace arm_compute
 {
 class ICLTensor;
 
-/** Interface for the identifying the max value of 1D Logits */
-class CLLogits1DMaxKernel : public ICLSimple3DKernel
-{
-public:
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input  Source tensor. Data types supported: QASYMM8/F16/F32
-     * @param[out] output Destination tensor. Data types supported: same as @p input
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
-    /** Set the input and output tensors.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: QASYMM8/F16/F32
-     * @param[out] output          Destination tensor. Data types supported: same as @p input
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DMaxKernel
-     *
-     * @param[in] input  Source tensor. Data types supported: QASYMM8/F16/F32
-     * @param[in] output Destination tensor. Data types supported: same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-};
-
-/** Interface for shifting, exponentiating and summing the logits */
-class CLLogits1DShiftExpSumKernel : public ICLKernel
-{
-public:
-    /** Default constructor */
-    CLLogits1DShiftExpSumKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLLogits1DShiftExpSumKernel(const CLLogits1DShiftExpSumKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLLogits1DShiftExpSumKernel &operator=(const CLLogits1DShiftExpSumKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLLogits1DShiftExpSumKernel(CLLogits1DShiftExpSumKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLLogits1DShiftExpSumKernel &operator=(CLLogits1DShiftExpSumKernel &&) = default;
-    /** Set the input and output tensors.
-     *
-     * @param[in]  input  Source tensor. Data types supported: QASYMM8/F16/F32
-     * @param[in]  max    Max values tensor. Data types supported: same as @p input
-     * @param[out] output Destination tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input
-     * @param[out] sum    Sum of 1D logits tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input
-     * @param[in]  beta   (Optional) A scaling factor for the exponent. Defaults to 1.0
-     */
-    void configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta = 1.0f);
-    /** Set the input and output tensors.
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: QASYMM8/F16/F32
-     * @param[in]  max             Max values tensor. Data types supported: same as @p input
-     * @param[out] output          Destination tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input
-     * @param[out] sum             Sum of 1D logits tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input
-     * @param[in]  beta            (Optional) A scaling factor for the exponent. Defaults to 1.0
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta = 1.0f);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DShiftExpSumKernel
-     *
-     * @param[in] input  Source tensor. Data types supported: QASYMM8/F16/F32
-     * @param[in] max    Max values tensor. Data types supported: same as @p input
-     * @param[in] output Destination tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input
-     * @param[in] sum    Sum of 1D logits tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum);
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;
-    const ICLTensor *_max;
-    ICLTensor       *_output;
-    ICLTensor       *_sum;
-};
-
 /** Interface for max, shifting, exponentiating and summing the logits */
 class CLLogits1DMaxShiftExpSumKernel : public ICLKernel
 {
@@ -134,7 +51,7 @@
     CLLogits1DMaxShiftExpSumKernel &operator=(CLLogits1DMaxShiftExpSumKernel &&) = default;
     /** Set the input and output tensors.
      *
-     * @param[in]     input  Source tensor. Data types supported: F16/F32
+     * @param[in]     input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
      * @param[in,out] max    Max values tensor. Data types supported: same as @p input
      * @param[out]    output Destination tensor. Data types supported: same as @p input
      * @param[out]    sum    Sum of 1D logits tensor. Data types supported: same as @p input
@@ -144,7 +61,7 @@
     /** Set the input and output tensors.
      *
      * @param[in]     compile_context The compile context to be used.
-     * @param[in]     input           Source tensor. Data types supported: F16/F32
+     * @param[in]     input           Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
      * @param[in,out] max             Max values tensor. Data types supported: same as @p input
      * @param[out]    output          Destination tensor. Data types supported: same as @p input
      * @param[out]    sum             Sum of 1D logits tensor. Data types supported: same as @p input
@@ -153,7 +70,7 @@
     void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *max, ICLTensor *output, ICLTensor *sum, const SoftmaxKernelInfo &info);
     /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DMaxShiftExpSumKernel
      *
-     * @param[in] input  Source tensor. Data types supported: F16/F32
+     * @param[in] input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
      * @param[in] max    Max values tensor. Data types supported: same as @p input
      * @param[in] output Destination tensor. Data types supported: same as @p input
      * @param[in] sum    Sum of 1D logits tensor. Data types supported: same as @p input
@@ -203,24 +120,24 @@
     CLLogits1DNormKernel &operator=(CLLogits1DNormKernel &&) = default;
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: S32/F16/F32
+     * @param[in]  input  Source tensor. Data types supported: S32/F16/F32. If this kernel is used for log softmax, only F32/F16 is supported.
      * @param[in]  sum    Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input
-     * @param[out] output Destination tensor. Data types supported: QASYMM8 for S32 @p input, or same as @p input
+     * @param[out] output Destination tensor. Data types supported: QASYMM8/QASYMM8_SIGNED for S32 @p input, or same as @p input
      * @param[in]  info   Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
      */
     void configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, const SoftmaxKernelInfo &info);
     /** Set the input and output tensors.
      *
      * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: S32/F16/F32
+     * @param[in]  input           Source tensor. Data types supported: S32/F16/F32. If this kernel is used for log softmax, only F32/F16 is supported.
      * @param[in]  sum             Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input
-     * @param[out] output          Destination tensor. Data types supported: QASYMM8 for S32 @p input, or same as @p input
+     * @param[out] output          Destination tensor. Data types supported: QASYMM8/QASYMM8_SIGNED for S32 @p input, or same as @p input
      * @param[in]  info            Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
      */
     void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, const SoftmaxKernelInfo &info);
     /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DNormKernel
      *
-     * @param[in] input  Source tensor. Data types supported: S32/F16/F32
+     * @param[in] input  Source tensor. Data types supported: S32/F16/F32. If this kernel is used for log softmax, only F32/F16 is supported.
      * @param[in] sum    Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input
      * @param[in] output Destination tensor. Data types supported: QASYMM8 for S32 @p input, or same as @p input
      * @param[in] info   Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
diff --git a/arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h b/arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h
index 799b7b1..93221f7 100644
--- a/arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLSpaceToDepthLayerKernel.h b/arm_compute/core/CL/kernels/CLSpaceToDepthLayerKernel.h
index f2371e7..af0aa12 100644
--- a/arm_compute/core/CL/kernels/CLSpaceToDepthLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLSpaceToDepthLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLStackLayerKernel.h b/arm_compute/core/CL/kernels/CLStackLayerKernel.h
index e11c0a3..cfefcd9 100644
--- a/arm_compute/core/CL/kernels/CLStackLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLStackLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLStridedSliceKernel.h b/arm_compute/core/CL/kernels/CLStridedSliceKernel.h
index ebe1b38..74311b7 100644
--- a/arm_compute/core/CL/kernels/CLStridedSliceKernel.h
+++ b/arm_compute/core/CL/kernels/CLStridedSliceKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,49 +31,17 @@
 
 namespace arm_compute
 {
-// Forward declarations
-class ICLTensor;
-
 /** Interface for the kernel to perform tensor strided slicing */
 class CLStridedSliceKernel : public ICLKernel
 {
 public:
-    /** Default constructor */
-    CLStridedSliceKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLStridedSliceKernel(const CLStridedSliceKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLStridedSliceKernel &operator=(const CLStridedSliceKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLStridedSliceKernel(CLStridedSliceKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLStridedSliceKernel &operator=(CLStridedSliceKernel &&) = default;
-    /** Default destructor */
-    ~CLStridedSliceKernel() = default;
-    /** Configure kernel
-     *
-     * @note Supported tensor rank: up to 4
-     *
-     * @param[in]  input            Source tensor. Data type supported: All.
-     * @param[out] output           Destination tensor. Data type supported: Same as @p input
-     * @param[in]  starts           The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
-     * @param[in]  ends             The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
-     * @param[in]  strides          The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input).
-     * @param[in]  begin_mask       If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
-     * @param[in]  end_mask         If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
-     * @param[in]  shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
-     *                              A slice of size 1 starting from starts[i] in the dimension must be preserved.
-     */
-    void configure(const ICLTensor *input, ICLTensor *output,
-                   const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                   int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
     /** Configure kernel
      *
      * @note Supported tensor rank: up to 4
      *
      * @param[in]  compile_context  The compile context to be used.
-     * @param[in]  input            Source tensor. Data type supported: All.
-     * @param[out] output           Destination tensor. Data type supported: Same as @p input
+     * @param[in]  input            Source tensor info. Data type supported: All.
+     * @param[out] output           Destination tensor info. Data type supported: Same as @p input
      * @param[in]  starts           The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
      * @param[in]  ends             The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
      * @param[in]  strides          The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input).
@@ -82,7 +50,7 @@
      * @param[in]  shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
      *                              A slice of size 1 starting from starts[i] in the dimension must be preserved.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output,
                    const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
                    int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
 
@@ -105,11 +73,7 @@
                            int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
 
     // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;  /**< Source tensor */
-    ICLTensor       *_output; /**< Destination tensor */
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_CL_STRIDED_SLICE_KERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLTableLookupKernel.h b/arm_compute/core/CL/kernels/CLTableLookupKernel.h
index 24e333f..9f1d28c 100644
--- a/arm_compute/core/CL/kernels/CLTableLookupKernel.h
+++ b/arm_compute/core/CL/kernels/CLTableLookupKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLThresholdKernel.h b/arm_compute/core/CL/kernels/CLThresholdKernel.h
index 3db4870..7e01fd6 100644
--- a/arm_compute/core/CL/kernels/CLThresholdKernel.h
+++ b/arm_compute/core/CL/kernels/CLThresholdKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,45 +25,33 @@
 #define ARM_COMPUTE_CLTHRESHOLDKERNEL_H
 
 #include "arm_compute/core/CL/ICLSimple2DKernel.h"
+#include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/Types.h"
 
-#include <cstdint>
-
 namespace arm_compute
 {
+// Forward declarations
 class ICLTensor;
 
-/** Interface for the thresholding kernel.
- *
- */
+/** Interface for the thresholding kernel. */
 class CLThresholdKernel : public ICLSimple2DKernel
 {
 public:
     /**Initialise the kernel's input, output and threshold parameters.
      *
-     * @param[in]  input       An input tensor. Data types supported: U8
-     * @param[out] output      The output tensor. Data types supported: U8.
-     * @param[in]  threshold   Threshold. When the threshold type is RANGE, this is used as the lower threshold.
-     * @param[in]  false_value value to set when the condition is not respected.
-     * @param[in]  true_value  value to set when the condition is respected.
-     * @param[in]  type        Thresholding type. Either RANGE or BINARY.
-     * @param[in]  upper       Upper threshold. Only used when the thresholding type is RANGE.
+     * @param[in]  input  An input tensor. Data types supported: U8
+     * @param[out] output The output tensor. Data types supported: U8.
+     * @param[in]  info   Threshold descriptor
      */
-    void configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold,
-                   uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper);
+    void configure(const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info);
     /**Initialise the kernel's input, output and threshold parameters.
      *
      * @param[in]  compile_context The compile context to be used.
      * @param[in]  input           An input tensor. Data types supported: U8
      * @param[out] output          The output tensor. Data types supported: U8.
-     * @param[in]  threshold       Threshold. When the threshold type is RANGE, this is used as the lower threshold.
-     * @param[in]  false_value     value to set when the condition is not respected.
-     * @param[in]  true_value      value to set when the condition is respected.
-     * @param[in]  type            Thresholding type. Either RANGE or BINARY.
-     * @param[in]  upper           Upper threshold. Only used when the thresholding type is RANGE.
+     * @param[in]  info            Threshold descriptor
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, uint8_t threshold,
-                   uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper);
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info);
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NETHRESHOLDKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLTileKernel.h b/arm_compute/core/CL/kernels/CLTileKernel.h
index 68f3c92..56e1df8 100644
--- a/arm_compute/core/CL/kernels/CLTileKernel.h
+++ b/arm_compute/core/CL/kernels/CLTileKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLTransposeKernel.h b/arm_compute/core/CL/kernels/CLTransposeKernel.h
index 09c9e3b..4a9887f 100644
--- a/arm_compute/core/CL/kernels/CLTransposeKernel.h
+++ b/arm_compute/core/CL/kernels/CLTransposeKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h b/arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h
index e6b4209..b523b97 100644
--- a/arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,7 +49,7 @@
 
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  input             Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  input             Source tensor. Data types supported: All.
      * @param[out] output            Destination tensor. Data types supported: same as @p input.
      * @param[in]  info              Contains stride information described in @ref Size2D.
      * @param[in]  upsampling_policy Defines the policy to fill the intermediate pixels.
@@ -58,7 +58,7 @@
     /** Initialise the kernel's input and output.
      *
      * @param[in]  compile_context   The compile context to be used.
-     * @param[in]  input             Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  input             Source tensor. Data types supported: All.
      * @param[out] output            Destination tensor. Data types supported: same as @p input.
      * @param[in]  info              Contains stride information described in @ref Size2D.
      * @param[in]  upsampling_policy Defines the policy to fill the intermediate pixels.
@@ -66,7 +66,7 @@
     void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Size2D &info, const InterpolationPolicy upsampling_policy);
     /** Static function to check if given info will lead to a valid configuration of @ref CLUpsampleLayerKernel
      *
-     * @param[in] input             Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] input             Source tensor info. Data types supported: All.
      * @param[in] output            Destination tensor info. Data types supported: same as @p input.
      * @param[in] info              Contains  stride information described in @ref Size2D.
      * @param[in] upsampling_policy Defines the policy to fill the intermediate pixels.
diff --git a/arm_compute/core/CL/kernels/CLWarpAffineKernel.h b/arm_compute/core/CL/kernels/CLWarpAffineKernel.h
index a21325e..440feba 100644
--- a/arm_compute/core/CL/kernels/CLWarpAffineKernel.h
+++ b/arm_compute/core/CL/kernels/CLWarpAffineKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h b/arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h
index bb1a018..6614989 100644
--- a/arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h
+++ b/arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
index 47e987b..c74255b 100644
--- a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
+++ b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h b/arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h
index a39ccc2..a379b5f 100644
--- a/arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h
+++ b/arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,8 +30,6 @@
 
 namespace arm_compute
 {
-class ICLTensor;
-
 /** Interface for the width concatenate kernel of 2 tensors.
  *  The input1 and input2 tensors will be concatenated into the output tensor.
  */
@@ -39,7 +37,7 @@
 {
 public:
     /** Default constructor */
-    CLWidthConcatenate2TensorsKernel();
+    CLWidthConcatenate2TensorsKernel() = default;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     CLWidthConcatenate2TensorsKernel(const CLWidthConcatenate2TensorsKernel &) = delete;
     /** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -52,19 +50,12 @@
     ~CLWidthConcatenate2TensorsKernel() = default;
     /** Initialise the kernel's input1s and output
      *
-     * @param[in]  input1 First input tensor. Data types supported: All.
-     * @param[in]  input2 Second input tensor. Data types supported: same as @p input1
-     * @param[out] output Output tensor. Data types supported: Same as @p input1.
-     */
-    void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
-    /** Initialise the kernel's input1s and output
-     *
      * @param[in]  compile_context The compile context to be used.
      * @param[in]  input1          First input tensor. Data types supported: All.
      * @param[in]  input2          Second input tensor. Data types supported: same as @p input1
      * @param[out] output          Output tensor. Data types supported: Same as @p input1.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output);
     /**  Static function to check if given info will lead to a valid configuration of @ref CLWidthConcatenate2TensorsKernel
      *
      * @param[in] input1 First tensor info. Data types supported: All.
@@ -76,12 +67,7 @@
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
 
     // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input1;
-    const ICLTensor *_input2;
-    ICLTensor       *_output;
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CLWIDTHCONCATENATE_2TENSORS_KERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h b/arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h
index 0e0eae6..6b0e8ee 100644
--- a/arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h
+++ b/arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,8 +30,6 @@
 
 namespace arm_compute
 {
-class ICLTensor;
-
 /** Interface for the width concatenate kernel of 4 tensors.
  *  All input tensors will be concatenated into the output tensor.
  */
@@ -52,15 +50,6 @@
     ~CLWidthConcatenate4TensorsKernel() = default;
     /** Initialise the kernel's input1s and output
      *
-     * @param[in]  input1 First input tensor. Data types supported: All.
-     * @param[in]  input2 Second input tensor. Data types supported: same as @p input1
-     * @param[in]  input3 Third input tensor. Data types supported: same as @p input1
-     * @param[in]  input4 Fourth input tensor. Data types supported: same as @p input1
-     * @param[out] output Output tensor. Data types supported: Same as @p input1.
-     */
-    void configure(const ICLTensor *input1, const ICLTensor *input2, const ICLTensor *input3, const ICLTensor *input4, ICLTensor *output);
-    /** Initialise the kernel's input1s and output
-     *
      * @param[in]  compile_context The compile context to be used.
      * @param[in]  input1          First input tensor. Data types supported: All.
      * @param[in]  input2          Second input tensor. Data types supported: same as @p input1
@@ -68,7 +57,7 @@
      * @param[in]  input4          Fourth input tensor. Data types supported: same as @p input1
      * @param[out] output          Output tensor. Data types supported: Same as @p input1.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, const ICLTensor *input3, const ICLTensor *input4, ICLTensor *output);
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *input3, ITensorInfo *input4, ITensorInfo *output);
     /**  Static function to check if given info will lead to a valid configuration of @ref CLWidthConcatenate4TensorsKernel
      *
      * @param[in] input1 First tensor info. Data types supported: All.
@@ -82,14 +71,7 @@
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *input3, const ITensorInfo *input4, const ITensorInfo *output);
 
     // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input1;
-    const ICLTensor *_input2;
-    const ICLTensor *_input3;
-    const ICLTensor *_input4;
-    ICLTensor       *_output;
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CLWIDTHCONCATENATE_4TENSORS_KERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h b/arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h
index ef5851f..32e90af 100644
--- a/arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,8 +30,6 @@
 
 namespace arm_compute
 {
-class ICLTensor;
-
 /** Interface for the width concatenate kernel.
  *  The input tensor will be concatenated into the output tensor.
  */
@@ -52,21 +50,13 @@
     ~CLWidthConcatenateLayerKernel() = default;
     /** Initialise the kernel's inputs and output
      *
-     * @param[in]     input        Input tensor. Data types supported: All.
-     * @param[in]     width_offset The offset on the X axis.
-     * @param[in,out] output       Output tensor. Data types supported: Same as @p input.
-     *
-     */
-    void configure(const ICLTensor *input, unsigned int width_offset, ICLTensor *output);
-    /** Initialise the kernel's inputs and output
-     *
      * @param[in]     compile_context The compile context to be used.
      * @param[in]     input           Input tensor. Data types supported: All.
      * @param[in]     width_offset    The offset on the X axis.
      * @param[in,out] output          Output tensor. Data types supported: Same as @p input.
      *
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int width_offset, ICLTensor *output);
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int width_offset, ITensorInfo *output);
     /**  Static function to check if given info will lead to a valid configuration of @ref CLWidthConcatenateLayerKernel
      *
      * @param[in] input        Input tensor info. Data types supported: All.
@@ -78,12 +68,10 @@
     static Status validate(const ITensorInfo *input, unsigned int width_offset, const ITensorInfo *output);
 
     // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 
 private:
-    const ICLTensor *_input;
-    ICLTensor       *_output;
-    unsigned int     _width_offset;
+    unsigned int _width_offset;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CLWIDTHCONCATENATELAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h b/arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h
index 5b2dc8c..b689be8 100644
--- a/arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h
+++ b/arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h b/arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h
index a305126..4f198f0 100644
--- a/arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h
+++ b/arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h b/arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h
index 512b352..f7cbd05 100644
--- a/arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h
+++ b/arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/CLYOLOLayerKernel.h b/arm_compute/core/CL/kernels/CLYOLOLayerKernel.h
index d0c4a9e..52b0698 100644
--- a/arm_compute/core/CL/kernels/CLYOLOLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLYOLOLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h b/arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h
index f0f7754..d182e38 100644
--- a/arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h
+++ b/arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CPP/CPPKernels.h b/arm_compute/core/CPP/CPPKernels.h
index c7b40ba..7187613 100644
--- a/arm_compute/core/CPP/CPPKernels.h
+++ b/arm_compute/core/CPP/CPPKernels.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CPP/CPPTypes.h b/arm_compute/core/CPP/CPPTypes.h
index d3f6fc9..fd6bfc3 100644
--- a/arm_compute/core/CPP/CPPTypes.h
+++ b/arm_compute/core/CPP/CPPTypes.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,7 +44,9 @@
     GENERIC_FP16_DOT,
     A53,
     A55r0,
-    A55r1
+    A55r1,
+    X1,
+    A73
 };
 
 /** Global memory policy.
@@ -94,6 +96,14 @@
         {
             return std::string("A55r1");
         }
+        case CPUModel::X1:
+        {
+            return std::string("X1");
+        }
+        case CPUModel::A73:
+        {
+            return std::string("A73");
+        }
         default:
         {
             ARM_COMPUTE_ERROR("Invalid CPUModel.");
diff --git a/arm_compute/core/CPP/ICPPKernel.h b/arm_compute/core/CPP/ICPPKernel.h
index ec05af2..ab369ff 100644
--- a/arm_compute/core/CPP/ICPPKernel.h
+++ b/arm_compute/core/CPP/ICPPKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,10 +26,13 @@
 
 #include "arm_compute/core/CPP/CPPTypes.h"
 #include "arm_compute/core/IKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/experimental/Types.h"
 
 namespace arm_compute
 {
 class Window;
+class ITensor;
 
 /** Common interface for all kernels implemented in C++ */
 class ICPPKernel : public IKernel
@@ -51,8 +54,7 @@
      */
     virtual void run(const Window &window, const ThreadInfo &info)
     {
-        ARM_COMPUTE_UNUSED(window);
-        ARM_COMPUTE_UNUSED(info);
+        ARM_COMPUTE_UNUSED(window, info);
         ARM_COMPUTE_ERROR("default implementation of legacy run() virtual member function invoked");
     }
 
@@ -69,6 +71,23 @@
         run(window, info);
     }
 
+    /** Execute the kernel on the passed window
+     *
+     * @warning If is_parallelisable() returns false then the passed window must be equal to window()
+     *
+     * @note The window has to be a region within the window returned by the window() method
+     *
+     * @note The width of the window has to be a multiple of num_elems_processed_per_iteration().
+     *
+     * @param[in] tensors A vector containing the tensors to operate on.
+     * @param[in] window  Region on which to execute the kernel. (Must be a region of the window returned by window())
+     * @param[in] info    Info about executing thread and CPU.
+     */
+    virtual void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+    {
+        ARM_COMPUTE_UNUSED(tensors, window, info);
+    }
+
     /** Name of the kernel
      *
      * @return Kernel name
diff --git a/arm_compute/core/CPP/ICPPSimpleKernel.h b/arm_compute/core/CPP/ICPPSimpleKernel.h
index acdd054..c31d487 100644
--- a/arm_compute/core/CPP/ICPPSimpleKernel.h
+++ b/arm_compute/core/CPP/ICPPSimpleKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CPP/Validate.h b/arm_compute/core/CPP/Validate.h
index dfee9de..9e95f72 100644
--- a/arm_compute/core/CPP/Validate.h
+++ b/arm_compute/core/CPP/Validate.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h b/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h
index 3fa83a6..1a3f2ba 100644
--- a/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h
+++ b/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,8 +24,6 @@
 #ifndef ARM_COMPUTE_CPPBOXWITHNONMAXIMASUPPRESSIONLIMITKERNEL_H
 #define ARM_COMPUTE_CPPBOXWITHNONMAXIMASUPPRESSIONLIMITKERNEL_H
 
-#include "arm_compute/core/IArray.h"
-#include "arm_compute/core/IHOG.h"
 #include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/Types.h"
 
@@ -62,7 +60,7 @@
      * @param[out] classes          The classes output tensor of size [N]. Data types supported: Same as @p scores_in
      * @param[out] batch_splits_out (Optional) The batch splits output tensor [batch_size]. Data types supported: Same as @p scores_in
      * @param[out] keeps            (Optional) The keeps output tensor of size [N]. Data types supported: Same as@p scores_in
-     * @param[out] keeps_size       (Optional) Number of filtered indices per class tensor of size [num_classes]. Data types supported: Same as @p scores_in
+     * @param[out] keeps_size       (Optional) Number of filtered indices per class tensor of size [num_classes]. Data types supported: U32
      * @param[in]  info             (Optional) BoxNMSLimitInfo information.
      */
     void configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes,
diff --git a/arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h b/arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h
index eeb6a65..ddb346d 100644
--- a/arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h
+++ b/arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h b/arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h
index cf8e4f0..dd6bbd5 100644
--- a/arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h
+++ b/arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h b/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h
index cb416af..e32b5d8 100644
--- a/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h
+++ b/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CPP/kernels/CPPPermuteKernel.h b/arm_compute/core/CPP/kernels/CPPPermuteKernel.h
index e75152f..d141c2f 100644
--- a/arm_compute/core/CPP/kernels/CPPPermuteKernel.h
+++ b/arm_compute/core/CPP/kernels/CPPPermuteKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -56,15 +56,15 @@
 
     /** Set the input and output of the kernel.
      *
-     * @param[in]  input  The input tensor to permute. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32
-     * @param[out] output The output tensor. Data types supported: Same as @p input
+     * @param[in]  input  The input tensor to permute. Data types supported: All.
+     * @param[out] output The output tensor. Data types supported: same as @p input
      * @param[in]  perm   Permutation vector
      */
     void configure(const ITensor *input, ITensor *output, const PermutationVector &perm);
     /** Static function to check if given info will lead to a valid configuration of @ref CPPPermuteKernel
      *
-     * @param[in] input  The input tensor to permute. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32
-     * @param[in] output The output tensor. Data types supported: Same as @p input
+     * @param[in] input  The input tensor to permute. Data types supported: All.
+     * @param[in] output The output tensor. Data types supported: same as @p input
      * @param[in] perm   Permutation vector
      *
      * @return a status
diff --git a/arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h b/arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h
index d127ef8..be4076f 100644
--- a/arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h
+++ b/arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/CPP/kernels/CPPTopKVKernel.h b/arm_compute/core/CPP/kernels/CPPTopKVKernel.h
index 4b9bfdd..1245dbc 100644
--- a/arm_compute/core/CPP/kernels/CPPTopKVKernel.h
+++ b/arm_compute/core/CPP/kernels/CPPTopKVKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -54,7 +54,7 @@
     /** Set the input and output of the kernel.
      *
      * @param[in]  predictions A batch_size x classes tensor. Data types supported: F16/S32/F32/QASYMM8/QASYMM8_SIGNED
-     * @param[in]  targets     A batch_size 1D tensor of class ids. Data types supported: S32
+     * @param[in]  targets     A batch_size 1D tensor of class ids. Data types supported: U32
      * @param[out] output      Computed precision at @p k as a bool 1D tensor. Data types supported: U8
      * @param[in]  k           Number of top elements to look at for computing precision.
      */
@@ -63,7 +63,7 @@
     /** Static function to check if given info will lead to a valid configuration of @ref CPPTopKVKernel
      *
      * @param[in] predictions A batch_size x classes tensor info. Data types supported: F16/S32/F32/QASYMM8/QASYMM8_SIGNED
-     * @param[in] targets     A batch_size 1D tensor info of class ids. Data types supported: S32
+     * @param[in] targets     A batch_size 1D tensor info of class ids. Data types supported: U32
      * @param[in] output      Computed precision at @p k as a bool 1D tensor info. Data types supported: U8
      * @param[in] k           Number of top elements to look at for computing precision.
      *
diff --git a/arm_compute/core/CPP/kernels/CPPUpsampleKernel.h b/arm_compute/core/CPP/kernels/CPPUpsampleKernel.h
index 9fbc9b6..dd7e07c 100644
--- a/arm_compute/core/CPP/kernels/CPPUpsampleKernel.h
+++ b/arm_compute/core/CPP/kernels/CPPUpsampleKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -55,8 +55,8 @@
 
     /** Set the input and output of the kernel.
      *
-     * @param[in]  input  The input tensor to upsample. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED
-     * @param[out] output The output tensor. Data types supported: Same as @p input
+     * @param[in]  input  The input tensor to upsample. Data types supported: All.
+     * @param[out] output The output tensor. Data types supported: same as @p input.
      * @param[in]  info   Padding info.
      */
     void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info);
diff --git a/arm_compute/core/Coordinates.h b/arm_compute/core/Coordinates.h
index 78ca525..f6e1f4d 100644
--- a/arm_compute/core/Coordinates.h
+++ b/arm_compute/core/Coordinates.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/Dimensions.h b/arm_compute/core/Dimensions.h
index fbaef3a..960238c 100644
--- a/arm_compute/core/Dimensions.h
+++ b/arm_compute/core/Dimensions.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/Error.h b/arm_compute/core/Error.h
index dd3e888..992d6bc 100644
--- a/arm_compute/core/Error.h
+++ b/arm_compute/core/Error.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/GCCoreRuntimeContext.h b/arm_compute/core/GLES_COMPUTE/GCCoreRuntimeContext.h
index 9706c9b..5c8f5d0 100644
--- a/arm_compute/core/GLES_COMPUTE/GCCoreRuntimeContext.h
+++ b/arm_compute/core/GLES_COMPUTE/GCCoreRuntimeContext.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/GCHelpers.h b/arm_compute/core/GLES_COMPUTE/GCHelpers.h
index b1a9ab3..194d49d 100644
--- a/arm_compute/core/GLES_COMPUTE/GCHelpers.h
+++ b/arm_compute/core/GLES_COMPUTE/GCHelpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h b/arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h
index 0f6daf7..fc39bef 100644
--- a/arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h
+++ b/arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/GCKernels.h b/arm_compute/core/GLES_COMPUTE/GCKernels.h
index a1537ec..1b94ae3 100644
--- a/arm_compute/core/GLES_COMPUTE/GCKernels.h
+++ b/arm_compute/core/GLES_COMPUTE/GCKernels.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/IGCKernel.h b/arm_compute/core/GLES_COMPUTE/IGCKernel.h
index 7b2aad7..7891b98 100644
--- a/arm_compute/core/GLES_COMPUTE/IGCKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/IGCKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h b/arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h
index ae8fd40..3b2cfa1 100644
--- a/arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h b/arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h
index 40a21ee..9ccd135 100644
--- a/arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/IGCSimpleKernel.h b/arm_compute/core/GLES_COMPUTE/IGCSimpleKernel.h
index c0f561a..e49941d 100644
--- a/arm_compute/core/GLES_COMPUTE/IGCSimpleKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/IGCSimpleKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/IGCTensor.h b/arm_compute/core/GLES_COMPUTE/IGCTensor.h
index c382095..080947d 100644
--- a/arm_compute/core/GLES_COMPUTE/IGCTensor.h
+++ b/arm_compute/core/GLES_COMPUTE/IGCTensor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/OpenGLES.h b/arm_compute/core/GLES_COMPUTE/OpenGLES.h
index 4454436..0bd7bed 100644
--- a/arm_compute/core/GLES_COMPUTE/OpenGLES.h
+++ b/arm_compute/core/GLES_COMPUTE/OpenGLES.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h
index d55f98f..ff8943b 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h
index 65e018a..5d36768 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.h
index 7e8159c..4cd56b5 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h
index eb7a99c..998042c 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h
index d96fb56..6ec9da9 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h
index 9c77549..52e8d72 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.h
index 8faa54a..95708d4 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h
index 43f94f8..64a51af 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.h
index e3dda67..b1338b8 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h
index 4dd7aa0..6852f90 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h
index cbc60da..c7d5181 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h
index 95f991e..e7de8a3 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h
index e4157a1..8b78e0c 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h
index 4dcae2e..4b6e08f 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h
index 29a4c8d..b091159 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h
index 7d1a53c..38c924d 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.h
index dd00cae..a942975 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.h
index 5156da8..b8bd226 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h
index 0c4b656..f9d84b8 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h
index 7a2fb84..a219077 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCScaleKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCScaleKernel.h
index 754f15c..7a2e40a 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCScaleKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCScaleKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,7 +25,7 @@
 #define ARM_COMPUTE_GCSCALEKERNEL_H
 
 #include "arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h"
-#include "arm_compute/core/Types.h"
+#include "arm_compute/core/KernelDescriptors.h"
 
 namespace arm_compute
 {
@@ -37,14 +37,12 @@
 public:
     /** Initialise the kernel's inputs, output and interpolation policy
      *
-     * @param[in]  input            Source tensor. Data types supported: F16
-     * @param[out] output           Destination tensor. Data types supported: Same as @p input
-     *                              All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
-     * @param[in]  policy           Interpolation type to use
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     * @param[in]  sampling_policy  (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
+     * @param[in]  input  Source tensor. Data types supported: F16
+     * @param[out] output Destination tensor. Data types supported: Same as @p input
+     *                    All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in]  info   @ref ScaleKernelInfo descriptor to be used to configure
      */
-    void configure(const IGCTensor *input, IGCTensor *output, InterpolationPolicy policy, bool border_undefined, SamplingPolicy sampling_policy = SamplingPolicy::CENTER);
+    void configure(const IGCTensor *input, IGCTensor *output, const ScaleKernelInfo &info);
 
     // Inherited methods overridden:
     void run(const Window &window) override;
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h
index 280efe1..1b0d450 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h
index 5243e54..72c3839 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h
index a981ae6..03f741b 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.h
index 134346b..3b238bf 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/GPUTarget.h b/arm_compute/core/GPUTarget.h
index 4959ee5..06025ca 100644
--- a/arm_compute/core/GPUTarget.h
+++ b/arm_compute/core/GPUTarget.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/HOGInfo.h b/arm_compute/core/HOGInfo.h
index 3cc472b..7314d9a 100644
--- a/arm_compute/core/HOGInfo.h
+++ b/arm_compute/core/HOGInfo.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/Helpers.h b/arm_compute/core/Helpers.h
index 09c672e..d056f93 100644
--- a/arm_compute/core/Helpers.h
+++ b/arm_compute/core/Helpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -801,6 +801,16 @@
     return x >= 0 ? x % m : (x % m + m) % m;
 }
 
+/** Convert a dimension axis to the number of dimensions in the range [0, @p dim_axis]
+ * Handle negative axis, negative axis is used to specify axis from the end (e.g. -1 for the last axis).
+ *
+ * @param[in] dim_axis The last axis (inclusive) in the range [0, @p dim_axis]
+ * @param[in] num_dims The total number of dimensions
+ *
+ * @return The number of dimensions in the range [0, @p dim_axis]
+ */
+inline size_t dim_index_2_num_dims(int32_t dim_axis, int32_t num_dims);
+
 /** Convert negative coordinates to positive in the range [0, num_dims_input]
  *
  * @param[out] coords    Array of coordinates to be converted.
diff --git a/arm_compute/core/Helpers.inl b/arm_compute/core/Helpers.inl
index 233d46b..07b4132 100644
--- a/arm_compute/core/Helpers.inl
+++ b/arm_compute/core/Helpers.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,11 @@
 
 namespace arm_compute
 {
+inline size_t dim_index_2_num_dims(int32_t dim_axis, int32_t num_dims)
+{
+    return static_cast<size_t>(wrap_around(dim_axis, num_dims)) + 1;
+}
+
 inline uint8_t pixel_area_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr, float hr, int x, int y)
 {
     ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
diff --git a/arm_compute/core/IAccessWindow.h b/arm_compute/core/IAccessWindow.h
index 227d1c4..880f6d6 100644
--- a/arm_compute/core/IAccessWindow.h
+++ b/arm_compute/core/IAccessWindow.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/IArray.h b/arm_compute/core/IArray.h
index c6a1499..5f8b13d 100644
--- a/arm_compute/core/IArray.h
+++ b/arm_compute/core/IArray.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/IDevice.h b/arm_compute/core/IDevice.h
index 5cffe64..2a648ff 100644
--- a/arm_compute/core/IDevice.h
+++ b/arm_compute/core/IDevice.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/IDistribution.h b/arm_compute/core/IDistribution.h
index cd6f25f..dd511bf 100644
--- a/arm_compute/core/IDistribution.h
+++ b/arm_compute/core/IDistribution.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/IDistribution1D.h b/arm_compute/core/IDistribution1D.h
index 081ba58..28a38f4 100644
--- a/arm_compute/core/IDistribution1D.h
+++ b/arm_compute/core/IDistribution1D.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/IHOG.h b/arm_compute/core/IHOG.h
index bf8bd73..a21095a 100644
--- a/arm_compute/core/IHOG.h
+++ b/arm_compute/core/IHOG.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/IKernel.h b/arm_compute/core/IKernel.h
index cb1ddb1..11132f2 100644
--- a/arm_compute/core/IKernel.h
+++ b/arm_compute/core/IKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/ILut.h b/arm_compute/core/ILut.h
index d1a03af..bb0ae8b 100644
--- a/arm_compute/core/ILut.h
+++ b/arm_compute/core/ILut.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/IMultiHOG.h b/arm_compute/core/IMultiHOG.h
index ab79fac..6b93fcf 100644
--- a/arm_compute/core/IMultiHOG.h
+++ b/arm_compute/core/IMultiHOG.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/IMultiImage.h b/arm_compute/core/IMultiImage.h
index 3abdfed..672c2fe 100644
--- a/arm_compute/core/IMultiImage.h
+++ b/arm_compute/core/IMultiImage.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/IPyramid.h b/arm_compute/core/IPyramid.h
index b2a7465..58fb0b4 100644
--- a/arm_compute/core/IPyramid.h
+++ b/arm_compute/core/IPyramid.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/ITensor.h b/arm_compute/core/ITensor.h
index 501279e..272b83d 100644
--- a/arm_compute/core/ITensor.h
+++ b/arm_compute/core/ITensor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/ITensorInfo.h b/arm_compute/core/ITensorInfo.h
index f2b4c15..c5f0949 100644
--- a/arm_compute/core/ITensorInfo.h
+++ b/arm_compute/core/ITensorInfo.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/ITensorPack.h b/arm_compute/core/ITensorPack.h
new file mode 100644
index 0000000..36b6aea
--- /dev/null
+++ b/arm_compute/core/ITensorPack.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_ITENSORPACK_H
+#define ARM_COMPUTE_ITENSORPACK_H
+
+#include <cstdint>
+#include <map>
+
+namespace arm_compute
+{
+// Forward declaration
+class ITensor;
+
+/** Tensor packing service */
+class ITensorPack
+{
+private:
+    struct PackElement
+    {
+        PackElement() = default;
+        PackElement(ITensor *tensor)
+            : tensor(tensor), ctensor(nullptr)
+        {
+        }
+        PackElement(const ITensor *ctensor)
+            : tensor(nullptr), ctensor(ctensor)
+        {
+        }
+
+        ITensor       *tensor{ nullptr };
+        const ITensor *ctensor{ nullptr };
+    };
+
+public:
+    /** Default Constructor */
+    ITensorPack() = default;
+    /** Add tensor to the pack
+     *
+     * @param[in] id     ID/type of the tensor to add
+     * @param[in] tensor Tensor to add
+     */
+    void add_tensor(int id, ITensor *tensor);
+
+    /** Add const tensor to the pack
+     *
+     * @param[in] id     ID/type of the tensor to add
+     * @param[in] tensor Tensor to add
+     */
+    void add_tensor(int id, const ITensor *tensor);
+    /** Get tensor of a given id from the pack
+     *
+     * @param[in] id ID of tensor to extract
+     *
+     * @return The pointer to the tensor if it exists and is non-const, else nullptr
+     */
+    ITensor *get_tensor(int id);
+    /** Get constant tensor of a given id
+     *
+     * @param[in] id ID of tensor to extract
+     *
+     * @return The pointer to the tensor if it exists and is const, else nullptr
+     */
+    const ITensor *get_const_tensor(int id) const;
+    /** Pack size accessor
+     *
+     * @return Number of tensors registered to the pack
+     */
+    size_t size() const;
+    /** Checks if pack is empty
+     *
+     * @return True if empty else false
+     */
+    bool empty() const;
+
+private:
+    std::map<unsigned int, PackElement> _pack{}; /**< Container with the packed tensors */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_ITENSORPACK_H */
diff --git a/arm_compute/core/KernelDescriptors.h b/arm_compute/core/KernelDescriptors.h
index 6b4691b..1ee1686 100644
--- a/arm_compute/core/KernelDescriptors.h
+++ b/arm_compute/core/KernelDescriptors.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #ifndef ARM_COMPUTE_CORE_KERNEL_DESCRIPTORS_H
 #define ARM_COMPUTE_CORE_KERNEL_DESCRIPTORS_H
 
+#include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Types.h"
 
 namespace arm_compute
@@ -168,5 +169,63 @@
     int32_t scalar{ 0 };            /**< Scalar value to multiply each reduced column/row by */
     bool    mul_by_scalar{ false }; /**< True if each column/row reduction has to be multiplied by a scalar value */
 };
+
+struct ScaleKernelInfo
+{
+    /** Constructor
+     *
+     * @param[in] interpolation_policy  Interpolation type to use
+     * @param[in] border_mode           Border mode policy
+     * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT and use_padding is set to false. Defaults to default @ref PixelValue
+     * @param[in] sampling_policy       (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
+     * @param[in] use_padding           (Optional) Is padding in use or not. Defaults to true.
+     * @param[in] align_corners         (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
+     */
+    ScaleKernelInfo(InterpolationPolicy interpolation_policy,
+                    BorderMode          border_mode,
+                    PixelValue          constant_border_value = PixelValue(),
+                    SamplingPolicy      sampling_policy       = SamplingPolicy::CENTER,
+                    bool                use_padding           = true,
+                    bool                align_corners         = false)
+        : interpolation_policy{ interpolation_policy },
+          border_mode{ border_mode },
+          constant_border_value{ constant_border_value },
+          sampling_policy{ sampling_policy },
+          use_padding{ use_padding },
+          align_corners{ align_corners }
+    {
+    }
+
+    InterpolationPolicy interpolation_policy;  /**< Interpolation type to use */
+    BorderMode          border_mode;           /**< Border mode policy */
+    PixelValue          constant_border_value; /**< Constant value to use for constant border mode policy */
+    SamplingPolicy      sampling_policy;       /**< Sampling policy used by the interpolation. */
+    bool                use_padding;           /**< Indication of using padding */
+    bool                align_corners;         /**< Align corners of input and output */
+};
+
+struct ThresholdKernelInfo
+{
+    /** Default constructor */
+    ThresholdKernelInfo() = default;
+    /** Constructor
+     *
+     * @param[in] threshold   Threshold. When the threshold type is RANGE, this is used as the lower threshold.
+     * @param[in] false_value Value to set when the condition is not respected.
+     * @param[in] true_value  Value to set when the condition is respected.
+     * @param[in] type        Thresholding type. Either RANGE or BINARY.
+     * @param[in] upper       Upper threshold. Only used when the thresholding type is RANGE.
+     */
+    ThresholdKernelInfo(uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
+        : threshold(threshold), false_value(false_value), true_value(true_value), type(type), upper(upper)
+    {
+    }
+
+    uint8_t       threshold{ 0 };
+    uint8_t       false_value{ 0 };
+    uint8_t       true_value{ 0 };
+    ThresholdType type{ ThresholdType::BINARY };
+    uint8_t       upper{ 0 };
+};
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CORE_KERNEL_DESCRIPTORS_H */
diff --git a/arm_compute/core/Log.h b/arm_compute/core/Log.h
index 1515557..bc0ecb8 100644
--- a/arm_compute/core/Log.h
+++ b/arm_compute/core/Log.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/MultiImageInfo.h b/arm_compute/core/MultiImageInfo.h
index fcd7ba7..0d776e1 100644
--- a/arm_compute/core/MultiImageInfo.h
+++ b/arm_compute/core/MultiImageInfo.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/INEKernel.h b/arm_compute/core/NEON/INEKernel.h
index c099723..87e17c8 100644
--- a/arm_compute/core/NEON/INEKernel.h
+++ b/arm_compute/core/NEON/INEKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/INESimpleKernel.h b/arm_compute/core/NEON/INESimpleKernel.h
index 5d9c1ec..abe15c1 100644
--- a/arm_compute/core/NEON/INESimpleKernel.h
+++ b/arm_compute/core/NEON/INESimpleKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/NEAsymm.h b/arm_compute/core/NEON/NEAsymm.h
index e4f4250..d5d824e 100644
--- a/arm_compute/core/NEON/NEAsymm.h
+++ b/arm_compute/core/NEON/NEAsymm.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -67,24 +67,23 @@
 
 /** Performs final quantization step on 16 elements
  *
- * @tparam is_bounded_relu Specified if a fused bounded relu should be applied
- *
- * @param in_s32                        Input to be quantized.
- * @param result_fixedpoint_multiplier  Result multiplier parameter
- * @param result_shift                  Result shift parameter
- * @param result_offset_after_shift_s32 Result offset parameter
- * @param min_u8                        Relu lower bound
- * @param max_u8                        Relu upper bound
+ * @param[in] in_s32                        Input to be quantized.
+ * @param[in] result_fixedpoint_multiplier  Result multiplier parameter
+ * @param[in] result_shift                  Result shift parameter
+ * @param[in] result_offset_after_shift_s32 Result offset parameter
+ * @param[in] min_u8                        Relu lower bound
+ * @param[in] max_u8                        Relu upper bound
+ * @param[in] is_bounded_relu               Specified if a fused bounded relu should be applied
  *
  * @return Quantized values
  */
-template <bool is_bounded_relu>
-uint8x16_t finalize_quantization(int32x4x4_t &in_s32,
-                                 int          result_fixedpoint_multiplier,
-                                 int32_t      result_shift,
-                                 int32x4_t    result_offset_after_shift_s32,
-                                 uint8x16_t   min_u8,
-                                 uint8x16_t   max_u8)
+inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32,
+                                        int          result_fixedpoint_multiplier,
+                                        int32_t      result_shift,
+                                        int32x4_t    result_offset_after_shift_s32,
+                                        uint8x16_t   min_u8,
+                                        uint8x16_t   max_u8,
+                                        bool         is_bounded_relu)
 {
     const static int32x4_t zero_s32 = vdupq_n_s32(0);
 
@@ -150,24 +149,23 @@
 
 /** Performs final quantization step on 16 elements
  *
- * @tparam is_bounded_relu Specified if a fused bounded relu should be applied
- *
- * @param in_s32                        Input to be quantized.
- * @param result_fixedpoint_multiplier  Result multiplier parameter
- * @param result_shift                  Result shift parameter
- * @param result_offset_after_shift_s32 Result offset parameter
- * @param min_s8                        Relu lower bound
- * @param max_s8                        Relu upper bound
+ * @param[in] in_s32                        Input to be quantized.
+ * @param[in] result_fixedpoint_multiplier  Result multiplier parameter
+ * @param[in] result_shift                  Result shift parameter
+ * @param[in] result_offset_after_shift_s32 Result offset parameter
+ * @param[in] min_s8                        Relu lower bound
+ * @param[in] max_s8                        Relu upper bound
+ * @param[in] is_bounded_relu               Specified if a fused bounded relu should be applied
  *
  * @return Quantized values
  */
-template <bool is_bounded_relu>
-int8x16_t finalize_quantization(int32x4x4_t &in_s32,
-                                int          result_fixedpoint_multiplier,
-                                int32_t      result_shift,
-                                int32x4_t    result_offset_after_shift_s32,
-                                int8x16_t    min_s8,
-                                int8x16_t    max_s8)
+inline int8x16_t finalize_quantization(int32x4x4_t &in_s32,
+                                       int          result_fixedpoint_multiplier,
+                                       int32_t      result_shift,
+                                       int32x4_t    result_offset_after_shift_s32,
+                                       int8x16_t    min_s8,
+                                       int8x16_t    max_s8,
+                                       bool         is_bounded_relu)
 {
     if(result_shift < 0)
     {
@@ -225,24 +223,23 @@
 
 /** Performs final quantization step on 16 elements for symmetric quantization
  *
- * @tparam is_bounded_relu Specified if a fused bounded relu should be applied
- *
- * @param in_s32                        Input to be quantized.
- * @param result_fixedpoint_multiplier  Result multiplier parameter
- * @param result_shift                  Result shift parameter
- * @param result_offset_after_shift_s32 Result offset parameter
- * @param min_s8                        Relu lower bound
- * @param max_s8                        Relu upper bound
+ * @param[in] in_s32                        Input to be quantized.
+ * @param[in] result_fixedpoint_multiplier  Result multiplier parameter
+ * @param[in] result_shift                  Result shift parameter
+ * @param[in] result_offset_after_shift_s32 Result offset parameter
+ * @param[in] min_s8                        Relu lower bound
+ * @param[in] max_s8                        Relu upper bound
+ * @param[in] is_bounded_relu               Specified if a fused bounded relu should be applied
  *
  * @return Quantized values
  */
-template <bool   is_bounded_relu>
 inline int8x16_t finalize_quantization_symm(int32x4x4_t       &in_s32,
                                             const int32x4x4_t &result_fixedpoint_multiplier,
                                             const int32x4x4_t &result_shift,
                                             const int32x4_t   &result_offset_after_shift_s32,
                                             const int8x16_t   &min_s8,
-                                            const int8x16_t   &max_s8)
+                                            const int8x16_t   &max_s8,
+                                            const bool         is_bounded_relu)
 {
     const static int32x4_t one_s32 = vdupq_n_s32(1);
 
@@ -322,21 +319,19 @@
 
 /** Performs final quantization step on single element
  *
- * @tparam is_bounded_relu Specified if a fused bounded relu should be applied
- *
  * @param[in] in_value                      Input to be quantized.
  * @param[in] result_fixedpoint_multiplier  Result multiplier parameter
  * @param[in] result_shift                  Result shift parameter
  * @param[in] result_offset_after_shift_s32 Result offset parameter
  * @param[in] min_u8                        Relu lower bound
  * @param[in] max_u8                        Relu upper bound
+ * @param[in] is_bounded_relu               Specified if a fused bounded relu should be applied
  *
  * @return Quantized value
  */
-template <bool is_bounded_relu>
 inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier,
                                      int32_t result_shift, int32_t result_offset_after_shift_s32,
-                                     uint8_t min_u8, uint8_t max_u8)
+                                     uint8_t min_u8, uint8_t max_u8, bool is_bounded_relu)
 {
     int32x4_t in_s32 = vdupq_n_s32(in_value);
 
@@ -367,21 +362,19 @@
 
 /** Performs final quantization step on single element
  *
- * @tparam is_bounded_relu Specified if a fused bounded relu should be applied
- *
  * @param[in] in_value                      Input to be quantized.
  * @param[in] result_fixedpoint_multiplier  Result multiplier parameter
  * @param[in] result_shift                  Result shift parameter
  * @param[in] result_offset_after_shift_s32 Result offset parameter
  * @param[in] min_s8                        Relu lower bound
  * @param[in] max_s8                        Relu upper bound
+ * @param[in] is_bounded_relu               Specified if a fused bounded relu should be applied
  *
  * @return Quantized value
  */
-template <bool is_bounded_relu>
 inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier,
                                     int32_t result_shift, int32_t result_offset_after_shift_s32,
-                                    int8_t min_s8, int8_t max_s8)
+                                    int8_t min_s8, int8_t max_s8, bool is_bounded_relu)
 {
     int32x4_t in_s32 = vdupq_n_s32(in_value);
 
diff --git a/arm_compute/core/NEON/NEAsymm.inl b/arm_compute/core/NEON/NEAsymm.inl
index 71205e0..d211382 100644
--- a/arm_compute/core/NEON/NEAsymm.inl
+++ b/arm_compute/core/NEON/NEAsymm.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/NEColorConvertHelper.inl b/arm_compute/core/NEON/NEColorConvertHelper.inl
index 2cf52e5..9fc1be5 100644
--- a/arm_compute/core/NEON/NEColorConvertHelper.inl
+++ b/arm_compute/core/NEON/NEColorConvertHelper.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h
index 3aff677..5758264 100644
--- a/arm_compute/core/NEON/NEFixedPoint.h
+++ b/arm_compute/core/NEON/NEFixedPoint.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl
index 14e51d8..c2c2b25 100644
--- a/arm_compute/core/NEON/NEFixedPoint.inl
+++ b/arm_compute/core/NEON/NEFixedPoint.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h
index 38701f4..f5d3aec 100644
--- a/arm_compute/core/NEON/NEKernels.h
+++ b/arm_compute/core/NEON/NEKernels.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -70,7 +70,6 @@
 #include "arm_compute/core/NEON/kernels/NEFastCornersKernel.h"
 #include "arm_compute/core/NEON/kernels/NEFillArrayKernel.h"
 #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h"
 #include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h"
 #include "arm_compute/core/NEON/kernels/NEFloorKernel.h"
 #include "arm_compute/core/NEON/kernels/NEFuseBatchNormalizationKernel.h"
@@ -84,10 +83,8 @@
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGatherKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h"
@@ -106,6 +103,7 @@
 #include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h"
 #include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h"
 #include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
+#include "arm_compute/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h"
 #include "arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h"
 #include "arm_compute/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h"
 #include "arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h"
@@ -149,7 +147,6 @@
 #include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
 #include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
 #include "arm_compute/core/NEON/kernels/NEWidthConcatenateLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"
 #include "arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h"
 
 #endif /* ARM_COMPUTE_NEKERNELS_H */
diff --git a/arm_compute/core/NEON/NEMath.h b/arm_compute/core/NEON/NEMath.h
index 8827bbf..b82a9a3 100644
--- a/arm_compute/core/NEON/NEMath.h
+++ b/arm_compute/core/NEON/NEMath.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl
index 032bfde..a1c3d41 100644
--- a/arm_compute/core/NEON/NEMath.inl
+++ b/arm_compute/core/NEON/NEMath.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/NESymm.h b/arm_compute/core/NEON/NESymm.h
index d6c5a70..6dee870 100644
--- a/arm_compute/core/NEON/NESymm.h
+++ b/arm_compute/core/NEON/NESymm.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h b/arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h
index 7d35e40..894e927 100644
--- a/arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h
+++ b/arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEAccumulateKernel.h b/arm_compute/core/NEON/kernels/NEAccumulateKernel.h
index 367385d..2e9935c 100644
--- a/arm_compute/core/NEON/kernels/NEAccumulateKernel.h
+++ b/arm_compute/core/NEON/kernels/NEAccumulateKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
index 82103b9..325647b 100644
--- a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,6 +33,7 @@
 
 namespace arm_compute
 {
+// Forward declarations
 class ITensor;
 
 /** Interface for the activation layer kernel. */
@@ -57,12 +58,12 @@
      *
      * @note If the output tensor is a nullptr, the activation function will be performed in-place
      *
-     * @param[in, out] input           Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
+     * @param[in, out] input           Source tensor info. In case of @p output tensor = nullptr, this tensor will store the result
      *                                 of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
-     * @param[out]     output          Destination tensor. Data type supported: same as @p input
+     * @param[out]     output          Destination tensor info. Data type supported: same as @p input
      * @param[in]      activation_info Activation layer information.
      */
-    void configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info);
+    void configure(const ITensorInfo *input, ITensorInfo *output, ActivationLayerInfo activation_info);
     /** Static function to check if given info will lead to a valid configuration of @ref NEActivationLayerKernel
      *
      * @param[in] input    Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result
@@ -75,7 +76,7 @@
     static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info);
 
     // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
 
 private:
     using ActivationFunction = ActivationLayerInfo::ActivationFunction;
@@ -83,36 +84,34 @@
      *
      * @param[in] window Region on which to execute the kernel.
      */
-    using ActivationFunctionExecutorPtr = void (NEActivationLayerKernel::*)(const Window &window);
+    using ActivationFunctionExecutorPtr = void (NEActivationLayerKernel::*)(const ITensor *src, ITensor *dst, const Window &window);
     /** Function to apply an activation function on a tensor.
      *
      * @param[in] window Region on which to execute the kernel
      */
     template <ActivationLayerInfo::ActivationFunction F, typename T>
     typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
-    activation(const Window &window);
+    activation(const ITensor *src, ITensor *dst, const Window &window);
     /** Function to apply an activation function on a tensor.
      *
      * @param[in] window Region on which to execute the kernel
      */
     template <ActivationLayerInfo::ActivationFunction F, typename T>
-    typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type activation(const Window &window);
+    typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type activation(const ITensor *src, ITensor *dst, const Window &window);
     /** Function to apply an activation function on a tensor.
      *
      * @param[in] window Region on which to execute the kernel
      */
     template <ActivationLayerInfo::ActivationFunction F, typename T>
-    typename std::enable_if<std::is_same<T, qasymm8_signed_t>::value, void>::type activation(const Window &window);
+    typename std::enable_if<std::is_same<T, qasymm8_signed_t>::value, void>::type activation(const ITensor *src, ITensor *dst, const Window &window);
     /** Function to apply an activation function on a tensor.
      *
      * @param[in] window Region on which to execute the kernel
      */
     template <ActivationLayerInfo::ActivationFunction F, typename T>
-    typename std::enable_if<std::is_same<T, qsymm16_t>::value, void>::type activation(const Window &window);
+    typename std::enable_if<std::is_same<T, qsymm16_t>::value, void>::type activation(const ITensor *src, ITensor *dst, const Window &window);
 
 private:
-    ITensor                      *_input;
-    ITensor                      *_output;
     ActivationFunctionExecutorPtr _func;
     ActivationLayerInfo           _act_info;
 };
diff --git a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
index 36d257b..eece570 100644
--- a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,23 +61,24 @@
      *   - (S16,U8)          -> S16
      *   - (U8,S16)          -> S16
      *   - (S16,S16)         -> S16
+     *   - (S32,S32)         -> S32
      *   - (F16,F16)         -> F16
      *   - (F32,F32)         -> F32
      *   - (QASYMM8,QASYMM8) -> QASYMM8
      *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
      *   - (QSYMM16,QSYMM16) -> QSYMM16
      *
-     * @param[in]  input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
-     * @param[in]  input2 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
-     * @param[out] output The output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     * @param[in]  input1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+     * @param[in]  input2 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+     * @param[out] output The output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
      * @param[in]  policy Overflow policy.
      */
-    void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
+    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy);
     /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticAdditionKernel
      *
-     * @param[in] input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
-     * @param[in] input2 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
-     * @param[in] output The output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     * @param[in] input1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+     * @param[in] input2 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+     * @param[in] output The output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
      * @param[in] policy Overflow policy.
      *
      * @return a status
@@ -85,24 +86,21 @@
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy);
 
     // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
 
 private:
     /** Common signature for all the specialised add functions
      *
-     * @param[in]  input1 An input tensor. Data types supported: U8/QASYMM8/S16/QSYMM16/F16/F32
-     * @param[in]  input2 An input tensor. Data types supported: U8/QASYMM8/S16/QSYMM16/F16/F32
-     * @param[out] output The output tensor. Data types supported: U8/QASYMM8/S16/QSYMM16/F16/F32.
+     * @param[in]  input1 First input tensor. Data types supported: U8/QASYMM8/S16/QSYMM16/F16/S32/F32
+     * @param[in]  input2 Second input tensor. Data types supported: U8/QASYMM8/S16/QSYMM16/F16/S32/F32
+     * @param[out] output The output tensor. Data types supported: U8/QASYMM8/S16/QSYMM16/F16/S32/F32.
      * @param[in]  policy Overflow policy.
      * @param[in]  window Region on which to execute the kernel.
      */
     using AddFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const Window &window);
     /** Add function to use for the particular tensor types passed to configure() */
-    AddFunction   *_func;
-    const ITensor *_input1;
-    const ITensor *_input2;
-    ITensor       *_output;
-    ConvertPolicy  _policy;
+    AddFunction *_func;
+    ConvertPolicy _policy;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NEARITHMETICADDITIONKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
index 919c685..e3a41a2 100644
--- a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -52,7 +52,7 @@
     /** Default destructor */
     ~NEArithmeticSubtractionKernel() = default;
 
-    /** Initialise the kernel's input, output and border mode.
+    /** Initialise the kernel's input and output.
      *
      * Valid configurations (Input1,Input2) -> Output :
      *
@@ -71,9 +71,21 @@
      * @param[out] output The output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32.
      * @param[in]  policy Overflow policy. Convert policy cannot be WRAP if datatype is quantized.
      */
-    void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
+    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy);
     /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtractionKernel
      *
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)                          -> U8
+     *   - (U8,U8)                          -> S16
+     *   - (QASYMM8, QASYMM8)               -> QASYMM8
+     *   - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (S16,U8)                         -> S16
+     *   - (U8,S16)                         -> S16
+     *   - (S16,S16)                        -> S16
+     *   - (F16,F16)                        -> F16
+     *   - (F32,F32)                        -> F32
+     *
      * @note Convert policy cannot be WRAP if datatype is QASYMM8
      *
      * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
@@ -86,8 +98,7 @@
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy);
 
     // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
 
 private:
     /** Common signature for all the specialised sub functions
@@ -96,13 +107,12 @@
      * @param[in]  input2 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
      * @param[out] output The output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32.
      * @param[in]  window Region on which to execute the kernel.
+     * @param[in]  is_sat Flag to indicate if the policy is SATURATE.
      */
-    using SubFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window);
+    using SubFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window, bool is_sat);
     /** Sub function to use for the particular tensor types passed to configure() */
-    SubFunction   *_func;
-    const ITensor *_input1;
-    const ITensor *_input2;
-    ITensor       *_output;
+    SubFunction *_func;
+    ConvertPolicy _policy;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NEARITHMETICSUBTRACTIONKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h b/arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h
index f943744..4788909 100644
--- a/arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -56,15 +56,15 @@
     ~NEBatchConcatenateLayerKernel() = default;
     /** Initialise the kernel's inputs and output
      *
-     * @param[in]     input        Input tensor. Data types supported: All.
+     * @param[in]     input        Input tensor info. Data types supported: All.
      * @param[in]     batch_offset The offset on axis # 3.
-     * @param[in,out] output       Output tensor. Data types supported: Same as @p input.
+     * @param[in,out] output       Output tensor info. Data types supported: Same as @p input.
      *
      * @note: The output tensor's low two dimensions can't be smaller than the input one's.
      * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2.
      *
      */
-    void configure(const ITensor *input, unsigned int batch_offset, ITensor *output);
+    void configure(const ITensorInfo *input, unsigned int batch_offset, ITensorInfo *output);
     /**  Static function to check if given info will lead to a valid configuration of @ref NEBatchConcatenateLayerKernel
      *
      * @param[in] input        Input tensor info. Data types supported: All.
@@ -76,15 +76,13 @@
     static Status validate(const ITensorInfo *input, unsigned int batch_offset, const ITensorInfo *output);
 
     // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
 
 private:
     using BatchConcatFunction = void(const ITensor *in, ITensor *out, unsigned int batch_offset, const Window &window);
 
 private:
     BatchConcatFunction *_func;
-    const ITensor       *_input;
-    ITensor             *_output;
     unsigned int         _batch_offset;
 };
 } // namespace arm_compute
diff --git a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
index d59ed7b..962d256 100644
--- a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,6 +28,7 @@
 
 namespace arm_compute
 {
+// Forward declarations
 class ITensor;
 
 /** Interface for the batch normalization layer kernel.
@@ -97,40 +98,26 @@
     /** Configure execution function in case of fused activation **/
     void configure_fused();
 
-    /** Template function to run batch normalization on fp16
-     *
-     * @tparam fused_activation Boolean that flags if its a fused activation or not
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     */
-    template <bool fused_activation, typename F>
-    void batch_normalization_fp16_nchw(const Window &window);
-    /** Template function to run batch normalization on fp16 on tensors with NHWC format
-     *
-     * @tparam fused_activation Boolean that flags if its a fused activation or not
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     */
-    template <bool fused_activation, typename F>
-    void batch_normalization_fp16_nhwc(const Window &window);
     /** Template function to run batch normalization on fp32
      *
+     * @tparam T                Specialization data type
      * @tparam fused_activation Boolean that flags if its a fused activation or not
      * @tparam F                Activation function functor to run
      *
      * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
      */
-    template <bool fused_activation, typename F>
-    void batch_normalization_fp32_nchw(const Window &window);
+    template <typename T, bool fused_activation, typename F>
+    void batch_normalization_nchw(const Window &window);
     /** Template function to run batch normalization on fp32 on tensors with NHWC format
      *
+     * @tparam T                Specialization data type
      * @tparam fused_activation Boolean that flags if its a fused activation or not
      * @tparam F                Activation function functor to run
      *
      * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
      */
-    template <bool fused_activation, typename F>
-    void batch_normalization_fp32_nhwc(const Window &window);
+    template <typename T, bool fused_activation, typename F>
+    void batch_normalization_nhwc(const Window &window);
     /** Common signature for all the batch normalization functions
      *
      * @param[in] window Region on which to execute the kernel.
diff --git a/arm_compute/core/NEON/kernels/NEBatchToSpaceLayerKernel.h b/arm_compute/core/NEON/kernels/NEBatchToSpaceLayerKernel.h
index 61e47b0..943577d 100644
--- a/arm_compute/core/NEON/kernels/NEBatchToSpaceLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEBatchToSpaceLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h b/arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h
index 7a77767..0e4c886 100644
--- a/arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h
+++ b/arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h b/arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h
index 3fb8c08..a20fdae 100644
--- a/arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h
+++ b/arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h b/arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h
index 5b53251..70db5fb 100644
--- a/arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h
+++ b/arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h b/arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h
index 0d91205..91f24f1 100644
--- a/arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h
+++ b/arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEBoundingBoxTransformKernel.h b/arm_compute/core/NEON/kernels/NEBoundingBoxTransformKernel.h
index e94f228..8b3953a 100644
--- a/arm_compute/core/NEON/kernels/NEBoundingBoxTransformKernel.h
+++ b/arm_compute/core/NEON/kernels/NEBoundingBoxTransformKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h b/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h
index 448e33b..32e991e 100644
--- a/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h
+++ b/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h b/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h
index 1979c5b..c4e1f3e 100644
--- a/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h
+++ b/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEChannelCombineKernel.h b/arm_compute/core/NEON/kernels/NEChannelCombineKernel.h
index 8f01938..5d32aed 100644
--- a/arm_compute/core/NEON/kernels/NEChannelCombineKernel.h
+++ b/arm_compute/core/NEON/kernels/NEChannelCombineKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEChannelExtractKernel.h b/arm_compute/core/NEON/kernels/NEChannelExtractKernel.h
index 8d62016..debae24 100644
--- a/arm_compute/core/NEON/kernels/NEChannelExtractKernel.h
+++ b/arm_compute/core/NEON/kernels/NEChannelExtractKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,8 +27,6 @@
 #include "arm_compute/core/NEON/INESimpleKernel.h"
 #include "arm_compute/core/Types.h"
 
-#include <cstdint>
-
 namespace arm_compute
 {
 class IMultiImage;
@@ -60,7 +58,7 @@
      *
      * @param[in]  input   Source tensor. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422
      * @param[in]  channel Channel to extract.
-     * @param[out] output  Destination tensor. Format supported: u8
+     * @param[out] output  Destination tensor. Format supported: U8
      */
     void configure(const ITensor *input, Channel channel, ITensor *output);
     /** Set the input and output of the kernel
diff --git a/arm_compute/core/NEON/kernels/NEChannelShuffleLayerKernel.h b/arm_compute/core/NEON/kernels/NEChannelShuffleLayerKernel.h
index 71659c4..e5bce7e 100644
--- a/arm_compute/core/NEON/kernels/NEChannelShuffleLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEChannelShuffleLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NECol2ImKernel.h b/arm_compute/core/NEON/kernels/NECol2ImKernel.h
index 9aa1062..e988771 100644
--- a/arm_compute/core/NEON/kernels/NECol2ImKernel.h
+++ b/arm_compute/core/NEON/kernels/NECol2ImKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEColorConvertKernel.h b/arm_compute/core/NEON/kernels/NEColorConvertKernel.h
index 3059288..88c03b7 100644
--- a/arm_compute/core/NEON/kernels/NEColorConvertKernel.h
+++ b/arm_compute/core/NEON/kernels/NEColorConvertKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h b/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h
index d451919..dadf9e9 100644
--- a/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h
+++ b/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h b/arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h
index 6ec2793..6c74a12 100644
--- a/arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h
+++ b/arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEConvolutionKernel.h b/arm_compute/core/NEON/kernels/NEConvolutionKernel.h
index 2b271de..51a6333 100644
--- a/arm_compute/core/NEON/kernels/NEConvolutionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEConvolutionKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NECopyKernel.h b/arm_compute/core/NEON/kernels/NECopyKernel.h
index d2dbbae..ddd14c1 100644
--- a/arm_compute/core/NEON/kernels/NECopyKernel.h
+++ b/arm_compute/core/NEON/kernels/NECopyKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NECropKernel.h b/arm_compute/core/NEON/kernels/NECropKernel.h
index 557a7a8..b7e185f 100644
--- a/arm_compute/core/NEON/kernels/NECropKernel.h
+++ b/arm_compute/core/NEON/kernels/NECropKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,7 +58,7 @@
      * @note Supported tensor rank: up to 4
      * @note Padding not supported.
      *
-     * @param[in]  input               Source tensor. Data type supported: U16/S16/U32/S32/F16/F32. Data layouts supported: NHWC.
+     * @param[in]  input               Source tensor. Data type supported: U8/U16/S16/U32/S32/F16/F32. Data layouts supported: NHWC.
      * @param[in]  crop_boxes          Tensor containing all possible boxes used to crop the image, each represented by 4 normalized values.
      *                                 Data type supported: F32
      * @param[in]  box_ind             One dimensional tensor mapping the @p crop_box_ind to the index of the 3D image in @p input.
@@ -74,7 +74,7 @@
      * @note Supported tensor rank: up to 4
      * @note Padding not supported.
      *
-     * @param[in] input               Source tensor info. Data type supported: U16/S16/U32/S32/F16/F32. Data layouts supported: NHWC.
+     * @param[in] input               Source tensor info. Data type supported: U8/U16/S16/U32/S32/F16/F32. Data layouts supported: NHWC.
      * @param[in] crop_boxes          Tensor info for tensor containing all possible boxes used to crop the image. Data type supported: F32
      * @param[in] box_ind             Tensor info for the one dimensional tensor mapping the @p crop_box_ind to the index of the 3D image
      *                                in @p input. Data type supported: F32
diff --git a/arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h b/arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h
index 52442c3..e4fe81a 100644
--- a/arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h
+++ b/arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h b/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h
index 6690ac2..3b2b9a1 100644
--- a/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -56,15 +56,15 @@
     ~NEDepthConcatenateLayerKernel() = default;
     /** Initialise the kernel's inputs and output
      *
-     * @param[in]     input        Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]     input        Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[in]     depth_offset The offset on the Z axis.
-     * @param[in,out] output       Output tensor. Data types supported: Same as @p input.
+     * @param[in,out] output       Output tensor info. Data types supported: Same as @p input.
      *
      * @note: The output tensor's low two dimensions can't be smaller than the input one's.
      * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2.
      *
      */
-    void configure(const ITensor *input, unsigned int depth_offset, ITensor *output);
+    void configure(const ITensorInfo *input, unsigned int depth_offset, ITensorInfo *output);
     /**  Static function to check if given info will lead to a valid configuration of @ref NEDepthConcatenateLayerKernel
      *
      * @param[in] input        Input tensor info. Data types supported:  QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -76,15 +76,13 @@
     static Status validate(const ITensorInfo *input, unsigned int depth_offset, const ITensorInfo *output);
 
     // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
 
 private:
     using DepthConcatFunction = void(const ITensor *in, ITensor *out, unsigned int depth_offset, const Window &window);
 
 private:
     DepthConcatFunction *_func;
-    const ITensor       *_input;
-    ITensor             *_output;
     unsigned int         _depth_offset;
 };
 } // namespace arm_compute
diff --git a/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h b/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h
index 5cda320..e297fd7 100644
--- a/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h b/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h
index 0b64588..c497b2c 100644
--- a/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
index 227ddb4..6712e91 100644
--- a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h
index 9737c99..2e29234 100644
--- a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h
index 3792fb3..7b97d06 100644
--- a/arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -52,13 +52,13 @@
     ~NEDequantizationLayerKernel() = default;
     /** Set input, output tensors.
      *
-     * @param[in]  input  Source tensor. Data type supported: QASYMM8/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
+     * @param[in]  input  Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
      * @param[out] output Destination tensor with the same dimensions of input. Data type supported: F16/F32.
      */
     void configure(const ITensor *input, ITensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref NEDequantizationLayerKernel
      *
-     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
+     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
      * @param[in] output Output tensor info. Data types supported: F16/F32.
      *
      * @return a status
diff --git a/arm_compute/core/NEON/kernels/NEDerivativeKernel.h b/arm_compute/core/NEON/kernels/NEDerivativeKernel.h
index 20aee9b..7a46a41 100644
--- a/arm_compute/core/NEON/kernels/NEDerivativeKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDerivativeKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEDilateKernel.h b/arm_compute/core/NEON/kernels/NEDilateKernel.h
index 00a954d..424cf54 100644
--- a/arm_compute/core/NEON/kernels/NEDilateKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDilateKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
index 4ae283d..4cb9c90 100644
--- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h
index b7632d7..165f5bd 100644
--- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h b/arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h
index 61c25e1..47b8c3b 100644
--- a/arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h
+++ b/arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,18 +57,18 @@
     /** Default destructor */
     ~NEElementwiseOperationKernel() = default;
 
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
     /** Common signature for all the specialised arithmetic functions
      *
-     * @param[in] input1 First tensor input. Data types supported: QASYMM8/S16/F16/S32/F32.
-     * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
-     * @param[in] output Output tensor. Data types supported: Dependent on subclass.
-     * @param[in] window Region on which to execute the kernel.
+     * @param[in]  input1 First tensor input info. Data types supported: QASYMM8/S16/F16/S32/F32.
+     * @param[in]  input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[out] output Output tensor info. Data types supported: Dependent on subclass.
+     * @param[in]  window Region on which to execute the kernel.
      */
     using ElementwiseFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window);
 
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+
 protected:
     /** Validate the argument passed to the kernel
      *
@@ -81,7 +81,7 @@
    /** Common configure function for element-wise operators with no additional options (e.g. Min, Max, SquaredDiff)
      *
      */
-    void configure_common(const ITensor *input1, const ITensor *input2, ITensor *output);
+    void configure_common(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output);
 
     /** Function to use for the particular tensor types passed to configure() */
     std::function<void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window)> _function;
@@ -99,12 +99,12 @@
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel
      *
-     * @param[in] op     Arithmetic operation to be executed.
-     * @param[in] input1 First tensor input. Data types supported: QASYMM8/S16/F16/S32/F32.
-     * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
-     * @param[in] output Output tensor. Data types supported: Same as @p input1.
+     * @param[in]  op     Arithmetic operation to be executed.
+     * @param[in]  input1 First tensor input info. Data types supported: QASYMM8/S16/F16/S32/F32.
+     * @param[in]  input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[out] output Output tensor info. Data types supported: Same as @p input1.
      */
-    void configure(ArithmeticOperation op, const ITensor *input1, const ITensor *input2, ITensor *output);
+    void configure(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output);
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel
      *
@@ -130,11 +130,11 @@
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel
      *
-     * @param[in] input1 First tensor input. Data types supported: F16/F32.
-     * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
-     * @param[in] output Output tensor. Data types supported: Same as @p input1.
+     * @param[in]  input1 First tensor input info. Data types supported: F16/F32.
+     * @param[in]  input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[out] output Output tensor info. Data types supported: Same as @p input1.
      */
-    void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
+    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output);
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel
      *
@@ -159,17 +159,17 @@
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel
      *
-     * @param[in]  input1 First tensor input. Data types supported: F16/F32.
-     * @param[in]  input2 Second tensor input. Data types supported: Same as @p input1.
-     * @param[out] output Output tensor. Data types supported: Same as @p input1.
-     */
-    void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel
-     *
      * @param[in]  input1 First tensor input info. Data types supported: F16/F32.
      * @param[in]  input2 Second tensor input info. Data types supported: Same as @p input1.
      * @param[out] output Output tensor info. Data types supported: Same as @p input1.
+     */
+    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel
+     *
+     * @param[in] input1 First tensor input info. Data types supported: F16/F32.
+     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output Output tensor info. Data types supported: Same as @p input1.
      *
      * @return a Status
      */
@@ -188,19 +188,19 @@
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEComparisonOperationKernel
      *
-     * @param[in] op     Comparison operation to be executed.
-     * @param[in] input1 First tensor input. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
-     * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
-     * @param[in] output Output tensor. Data types supported: U16/U32.
+     * @param[in]  op     Comparison operation to be executed.
+     * @param[in]  input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+     * @param[in]  input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[out] output Output tensor info. Data types supported: U8.
      */
-    void configure(ComparisonOperation op, const ITensor *input1, const ITensor *input2, ITensor *output);
+    void configure(ComparisonOperation op, const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output);
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEComparisonOperationKernel
      *
      * @param[in] op     Comparison operation to be executed.
      * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
      * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
-     * @param[in] output Output tensor info. Data types supported: U16/U32.
+     * @param[in] output Output tensor info. Data types supported: U8.
      *
      * @return a Status
      */
diff --git a/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h b/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h
index 2a4a8f8..7f9d7ad 100644
--- a/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h
+++ b/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,18 +57,18 @@
     /** Default destructor */
     ~NEElementwiseUnaryKernel() = default;
 
-    /** Static function to check if given info will lead to a valid configuration of @ref NEElementwiseUnaryKernel
+    /** Function to configure the @ref NEElementwiseUnaryKernel
      *
-     * @param[in] op     Arithmetic operation to be executed.
-     * @param[in] input  First tensor input. Data types supported: F16/F32.
-     * @param[in] output Output tensor. Data types supported: Same as @p input.
+     * @param[in]  op     Arithmetic operation to be executed.
+     * @param[in]  input  First tensor input. Data types supported: F16/F32, F16/F32/S32 for NEG/ABS operations.
+     * @param[out] output Output tensor. Data types supported: Same as @p input.
      */
     void configure(ElementWiseUnary op, const ITensor *input, ITensor *output);
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEElementwiseUnaryKernel
      *
      * @param[in] op     Arithmetic operation to be executed.
-     * @param[in] input  First tensor input info. Data types supported: F16/F32.
+     * @param[in] input  First tensor input info. Data types supported: F16/F32, F16/F32/S32 for NEG/ABS operations.
      * @param[in] output Output tensor info. Data types supported: Same as @p input.
      *
      * @return a Status
@@ -78,23 +78,26 @@
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
 
+private:
     /** Common signature for all the specialised arithmetic functions
      *
-     * @param[in]  input  An input tensor. Data types supported: F16/F32.
-     * @param[out] output The output tensor. Data types supported: F16/F32.
-     * @param[in]  window Region on which to execute the kernel.
+     * @param[in] window Region on which to execute the kernel.
      */
-    using ElementwiseUnaryFunction = void(const ITensor *input, ITensor *output, const Window &window);
+    using ElementwiseUnaryPtr = void (NEElementwiseUnaryKernel::*)(const Window &window);
 
-protected:
-    // Inherited methods overridden:
-    static Status validate_arguments(ElementWiseUnary op, const ITensorInfo &input, const ITensorInfo &output);
+    /** Template function to run elementwise unary operation
+     *
+     * @tparam ScalarType Scalar datatype
+     *
+     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    template <typename ScalarType>
+    void elementwise_op(const Window &window);
 
-    /** Function to use for the particular tensor types passed to configure() */
-    std::function<void(const ITensor *input, ITensor *output, const Window &window)> _function;
-
-    const ITensor *_input;
-    ITensor       *_output;
+    ElementwiseUnaryPtr _func;
+    const ITensor      *_input;
+    ITensor            *_output;
+    ElementWiseUnary    _op;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NEELEMENTWISEUNARYKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEErodeKernel.h b/arm_compute/core/NEON/kernels/NEErodeKernel.h
index e3fcc28..140481d 100644
--- a/arm_compute/core/NEON/kernels/NEErodeKernel.h
+++ b/arm_compute/core/NEON/kernels/NEErodeKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h b/arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h
index ed17e3b..f7dc0b1 100644
--- a/arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h
+++ b/arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h b/arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h
index 6e16fca..15663e7 100644
--- a/arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h
+++ b/arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEFFTScaleKernel.h b/arm_compute/core/NEON/kernels/NEFFTScaleKernel.h
index 72963fa..c25ba32 100644
--- a/arm_compute/core/NEON/kernels/NEFFTScaleKernel.h
+++ b/arm_compute/core/NEON/kernels/NEFFTScaleKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEFastCornersKernel.h b/arm_compute/core/NEON/kernels/NEFastCornersKernel.h
index c0196c7..e4e87c0 100644
--- a/arm_compute/core/NEON/kernels/NEFastCornersKernel.h
+++ b/arm_compute/core/NEON/kernels/NEFastCornersKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEFillArrayKernel.h b/arm_compute/core/NEON/kernels/NEFillArrayKernel.h
index e45caec..99df879 100644
--- a/arm_compute/core/NEON/kernels/NEFillArrayKernel.h
+++ b/arm_compute/core/NEON/kernels/NEFillArrayKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEFillBorderKernel.h b/arm_compute/core/NEON/kernels/NEFillBorderKernel.h
index 0c852e8..071843d 100644
--- a/arm_compute/core/NEON/kernels/NEFillBorderKernel.h
+++ b/arm_compute/core/NEON/kernels/NEFillBorderKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h b/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h
deleted file mode 100644
index 9c1059e..0000000
--- a/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEFILLINNERBORDERKERNEL_H
-#define ARM_COMPUTE_NEFILLINNERBORDERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to fill the interior borders */
-class NEFillInnerBorderKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEFillInnerBorderKernel";
-    }
-    /** Default constructor */
-    NEFillInnerBorderKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEFillInnerBorderKernel(const NEFillInnerBorderKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEFillInnerBorderKernel &operator=(const NEFillInnerBorderKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEFillInnerBorderKernel(NEFillInnerBorderKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEFillInnerBorderKernel &operator=(NEFillInnerBorderKernel &&) = default;
-    /** Default destructor */
-    ~NEFillInnerBorderKernel() = default;
-
-    /** Initialise the function.
-     *
-     * @note This kernel fills the borders within the XY-planes.
-     *
-     * @param[in,out] input                 Tensor to process. Data types supported: U8/S16/S32/F32.
-     * @param[in]     border_size           Size of the border to fill in elements.
-     * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     *
-     */
-    void configure(ITensor *input, BorderSize border_size, const PixelValue &constant_border_value = PixelValue());
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    template <typename T>
-    void fill_value_single_channel(const Window &window);
-
-    ITensor   *_tensor;
-    BorderSize _border_size;
-    PixelValue _constant_border_value;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEFILLINNERBORDERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h b/arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h
index ba2f998..dbd2412 100644
--- a/arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEFloorKernel.h b/arm_compute/core/NEON/kernels/NEFloorKernel.h
index 4cdd9f2..255b0d4 100644
--- a/arm_compute/core/NEON/kernels/NEFloorKernel.h
+++ b/arm_compute/core/NEON/kernels/NEFloorKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEFuseBatchNormalizationKernel.h b/arm_compute/core/NEON/kernels/NEFuseBatchNormalizationKernel.h
index f598530..ecb17f8 100644
--- a/arm_compute/core/NEON/kernels/NEFuseBatchNormalizationKernel.h
+++ b/arm_compute/core/NEON/kernels/NEFuseBatchNormalizationKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h b/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h
index 6aa8e25..a2f0e8c 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
index b6e6bea..7ddbf4b 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
index 8f47c50..856cdf4 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
index b069e4c..5ce8403 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h
index 0dc64c9..4db0872 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -115,22 +115,18 @@
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
 
-    using NEGEMMLowpOffsetContributionOutputStageFunction = std::function<void(const Window, const ITensor *, const ITensor *, const ITensor *, const ITensor *,
-                                                                               ITensor *, int32_t, int32_t, int32_t, bool, GEMMLowpOutputStageInfo)>;
-
 private:
     /** Function to use for the particular tensors passed to configure() */
-    NEGEMMLowpOffsetContributionOutputStageFunction _function;
-    const ITensor                                  *_vector_sum_col;
-    const ITensor                                  *_vector_sum_row;
-    const ITensor                                  *_bias;
-    const ITensor                                  *_mm_result;
-    ITensor                                        *_output;
-    int32_t                                         _a_offset;
-    int32_t                                         _b_offset;
-    int32_t                                         _k_offset;
-    bool                                            _slide_vector_sum_col;
-    GEMMLowpOutputStageInfo                         _output_stage;
+    const ITensor          *_vector_sum_col;
+    const ITensor          *_vector_sum_row;
+    const ITensor          *_bias;
+    const ITensor          *_mm_result;
+    ITensor                *_output;
+    int32_t                 _a_offset;
+    int32_t                 _b_offset;
+    int32_t                 _k_offset;
+    bool                    _slide_vector_sum_col;
+    GEMMLowpOutputStageInfo _output_stage;
 };
 } // namespace arm_compute
 
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h
index b4a1419..4e0c8f8 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
index 0806bd1..d26c778 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
index 2b3657c..f166168 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
index 2f099a3..94ca617 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
index 1e472f5..53a542c 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,7 +49,7 @@
 
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  input  Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  input  Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
      * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
      * @param[in]  info   Kernel metadata:
      *                    - k            Number of matrix columns/rows depending on the type of reduction.
@@ -82,7 +82,7 @@
     }
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  mtx_a          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  mtx_a          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
      * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
      * @param[in]  info           Kernel metadata:
      *                            - k            (num_mtx_a_cols) Number of matrix A columns
@@ -93,7 +93,7 @@
     void configure(const ITensor *mtx_a, ITensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override;
     /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixAReductionKernel
      *
-     * @param[in] mtx_a          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in] mtx_a          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
      * @param[in] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
      * @param[in] info           Kernel metadata:
      *                           - k            (num_mtx_a_cols) Number of matrix A columns
@@ -131,7 +131,7 @@
     }
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  mtx_b          Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  mtx_b          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
      * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
      * @param[in]  info           Kernel metadata:
      *                            - k            (num_mtx_b_rows) Number of matrix B rows.
@@ -142,7 +142,7 @@
     void configure(const ITensor *mtx_b, ITensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override;
     /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixBReductionKernel
      *
-     * @param[in] mtx_b          Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in] mtx_b          Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
      * @param[in] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
      * @param[in] info           Kernel metadata:
      *                           - k            (num_mtx_b_rows) Number of matrix B rows.
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h
deleted file mode 100644
index a3ba57e..0000000
--- a/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H
-#define ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-/** NEON kernel to add a bias to each row of the input tensor */
-class NEGEMMMatrixAccumulateBiasesKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGEMMMatrixAccumulateBiasesKernel";
-    }
-    /** Default constructor */
-    NEGEMMMatrixAccumulateBiasesKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGEMMMatrixAccumulateBiasesKernel(const NEGEMMMatrixAccumulateBiasesKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGEMMMatrixAccumulateBiasesKernel &operator=(const NEGEMMMatrixAccumulateBiasesKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEGEMMMatrixAccumulateBiasesKernel(NEGEMMMatrixAccumulateBiasesKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEGEMMMatrixAccumulateBiasesKernel &operator=(NEGEMMMatrixAccumulateBiasesKernel &&) = default;
-    /** Default destructor */
-    ~NEGEMMMatrixAccumulateBiasesKernel() = default;
-    /** Set the accumulate buffer and the biases of the kernel.
-     *
-     * @param[in, out] accum  The accumulate tensor to convert. Data type supported: F32
-     * @param[in]      biases The shared biases tensor to append. It must be 1D Tensor. Data type supported: Same as @p input
-     */
-    void configure(ITensor *accum, const ITensor *biases);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixAccumulateBiasesKernel
-     *
-     * @param[in] accum  The accumulate tensor to convert. Data type supported: F32
-     * @param[in] biases The shared biases tensor to append. It must be 1D Tensor. Data type supported: Same as @p input
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *accum, const ITensorInfo *biases);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    ITensor       *_accum;
-    const ITensor *_biases;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h
index e528c59..79f6256 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h
index 841e08d..f79e07e 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h
deleted file mode 100644
index f5635dd..0000000
--- a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMMATRIXVECTORMULTIPLYKERNEL_H_
-#define ARM_COMPUTE_NEGEMMMATRIXVECTORMULTIPLYKERNEL_H_
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the GEMM matrix vector multiply kernel. **/
-class NEGEMMMatrixVectorMultiplyKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGEMMMatrixVectorMultiplyKernel";
-    }
-    /** Default constructor */
-    NEGEMMMatrixVectorMultiplyKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGEMMMatrixVectorMultiplyKernel(const NEGEMMMatrixVectorMultiplyKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGEMMMatrixVectorMultiplyKernel &operator=(const NEGEMMMatrixVectorMultiplyKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEGEMMMatrixVectorMultiplyKernel(NEGEMMMatrixVectorMultiplyKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEGEMMMatrixVectorMultiplyKernel &operator=(NEGEMMMatrixVectorMultiplyKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in]  input0 First Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
-     * @param[in]  input1 Second Input tensor. Data types supported: same as @p input.
-     * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input, S32 for QASYMM8/QASYMM8_SIGNED input.
-     */
-    void configure(const ITensor *input0, const ITensor *input1, ITensor *output);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixVectorMultiplyKernel
-     *
-     * @param[in] input0 First Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
-     * @param[in] input1 Second Input tensor. Data types supported: same as @p input.
-     * @param[in] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input, S32 for QASYMM8/QASYMM8_SIGNED input.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    /** Template function to run the matrix vector multiplication
-     *
-     * @tparam I0 Input 0 type
-     * @tparam I1 Input 1 type
-     * @tparam O  Output type
-     *
-     * @param[in] window_in  Input region. (Must be a valid region of the window returned by window()).
-     * @param[in] window_w   Weights region. (Must be a valid region of the window returned by window()).
-     * @param[in] window_out Output region.(Must be a valid region of the window returned by window()).
-     */
-    template <typename I0, typename I1, typename O>
-    void matrix_vector_multiply(const Window &window_in, const Window &window_w, const Window &window_out);
-    /** Common signature for all the specialised matrix vector multiplication functions */
-    using GEMMMatrixVectorMultiplyFunctionPtr = void (NEGEMMMatrixVectorMultiplyKernel::*)(const Window &window_in,
-                                                                                           const Window &window_w,
-                                                                                           const Window &window_out);
-
-private:
-    GEMMMatrixVectorMultiplyFunctionPtr _func;
-    const ITensor                      *_input0;
-    const ITensor                      *_input1;
-    ITensor                            *_output;
-    BorderSize                          _border_size;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMMATRIXVECTORMULTIPLYKERNEL_H_*/
diff --git a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
index 967a1b7..756ac6a 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEGatherKernel.h b/arm_compute/core/NEON/kernels/NEGatherKernel.h
index bfef40b..31d4f19 100644
--- a/arm_compute/core/NEON/kernels/NEGatherKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGatherKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h b/arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h
index fa92eef..c814181 100644
--- a/arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h
+++ b/arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,7 +41,7 @@
     /** Set the source, destination and border mode of the kernel
      *
      * @param[in]  input            Source tensor. Data type supported: U8
-     * @param[out] output           Destination tensor. Data type supported: S16
+     * @param[out] output           Destination tensor. Data type supported: same as @p input
      * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
      */
     void configure(const ITensor *input, ITensor *output, bool border_undefined);
diff --git a/arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h b/arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h
index 5e63e51..b489f4b 100644
--- a/arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h
+++ b/arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h b/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h
index 4700325..33a4452 100644
--- a/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEGenerateProposalsLayerKernel.h b/arm_compute/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
index 382ce54..7b82488 100644
--- a/arm_compute/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h b/arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h
index edb2da5..b0206ec 100644
--- a/arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h
+++ b/arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h b/arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h
index acb3592..2c23a2b 100644
--- a/arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h
+++ b/arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h b/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h
index a77fe16..084dd7d 100644
--- a/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h
+++ b/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h b/arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h
index be81f2e..8a5e86a 100644
--- a/arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,12 +57,12 @@
     ~NEHeightConcatenateLayerKernel() = default;
     /** Initialise the kernel's inputs and output
      *
-     * @param[in]     input         Input tensor. Data types supported: All
+     * @param[in]     input         Input tensor info. Data types supported: All
      * @param[in]     height_offset The starting offset on the Y axis for the output tensor.
-     * @param[in,out] output        Output tensor. Data types supported: Same as @p input.
+     * @param[in,out] output        Output tensor info. Data types supported: Same as @p input.
      *
      */
-    void configure(const ITensor *input, unsigned int height_offset, ITensor *output);
+    void configure(const ITensorInfo *input, unsigned int height_offset, ITensorInfo *output);
     /**  Static function to check if given info will lead to a valid configuration of @ref NEHeightConcatenateLayerKernel
      *
      * @param[in] input         Input tensor info. Data types supported: All
@@ -74,12 +74,10 @@
     static Status validate(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output);
 
     // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
 
 private:
-    const ITensor *_input;
-    ITensor       *_output;
-    unsigned int   _height_offset;
+    unsigned int _height_offset;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NEHEIGHTCONCATENATELAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEHistogramKernel.h b/arm_compute/core/NEON/kernels/NEHistogramKernel.h
index b1dd105..6e5b922 100644
--- a/arm_compute/core/NEON/kernels/NEHistogramKernel.h
+++ b/arm_compute/core/NEON/kernels/NEHistogramKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
index 1c358b3..95825ad 100644
--- a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
+++ b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -79,7 +79,7 @@
      * @param[in]  input       The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
      *                         while every optional dimension from 4 and above represent a batch of inputs.
      *                         Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32
-     *                         Note: QASYMM8 works only for has_bias = false
+     *                         Note: QASYMM8/QASYMM8_SIGNED works only for has_bias = false
      * @param[out] output      The output tensor. Data types supported: Same as @p input
      * @param[in]  kernel_dims The kernel dimensions (width and height).
      * @param[in]  conv_info   Contains padding and stride information described in @ref PadStrideInfo.
@@ -94,7 +94,7 @@
      * @param[in] input       The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
      *                        while every optional dimension from 4 and above represent a batch of inputs.
      *                        Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32
-     *                        Note: QASYMM8 works only for has_bias = false
+     *                        Note: QASYMM8/QASYMM8_SIGNED works only for has_bias = false
      * @param[in] output      The output tensor. Data types supported: Same as @p input
      * @param[in] kernel_dims The kernel dimensions (width and height).
      * @param[in] conv_info   Contains padding and stride information described in @ref PadStrideInfo.
diff --git a/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
index 7c14e40..a5bd453 100644
--- a/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,7 @@
 namespace arm_compute
 {
 class ITensor;
+struct InstanceNormalizationLayerKernelInfo;
 
 /** Interface for performing an instance normalization */
 class NEInstanceNormalizationLayerKernel : public INEKernel
@@ -52,26 +53,22 @@
     ~NEInstanceNormalizationLayerKernel() = default;
     /** Set the input and output tensors.
      *
-     * @param[in, out] input   Source tensor. Data types supported: F16/F32. Data layout supported: NCHW
-     *                         In case of @p output tensor = nullptr this tensor will store the result of the normalization.
-     * @param[out]     output  Destination tensor. Data types and data layouts supported: same as @p input.
-     * @param[in]      gamma   (Optional) The scale scalar value applied to the normalized tensor. Defaults to 1.0
-     * @param[in]      beta    (Optional) The offset scalar value applied to the normalized tensor. Defaults to 0.0
-     * @param[in]      epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+     * @param[in, out] input  Source tensor. Data types supported: F16/F32. Data layout supported: NCHW
+     *                        In case of @p output tensor = nullptr this tensor will store the result of the normalization.
+     * @param[out]     output Destination tensor. Data types and data layouts supported: same as @p input.
+     * @param[in]      info   Kernel meta-data descriptor
      */
-    void configure(ITensor *input, ITensor *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f);
+    void configure(ITensor *input, ITensor *output, const InstanceNormalizationLayerKernelInfo &info);
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEInstanceNormalizationLayer.
      *
-     * @param[in] input   Source tensor info. Data types supported: F16/F32. Data layout supported: NCHW
-     * @param[in] output  Destination tensor info. Data types and data layouts supported: same as @p input.
-     * @param[in] gamma   (Optional) The scale scalar value applied to the normalized tensor. Defaults to 1.0
-     * @param[in] beta    (Optional) The offset scalar value applied to the normalized tensor. Defaults to 0.0
-     * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+     * @param[in] input  Source tensor info. Data types supported: F16/F32. Data layout supported: NCHW
+     * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p input.
+     * @param[in] info   Kernel meta-data descriptor
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
@@ -93,6 +90,7 @@
     float                  _gamma;
     float                  _beta;
     float                  _epsilon;
+    bool                   _use_mixed_precision{ true };
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEIntegralImageKernel.h b/arm_compute/core/NEON/kernels/NEIntegralImageKernel.h
index 77ae7b9..57f24be 100644
--- a/arm_compute/core/NEON/kernels/NEIntegralImageKernel.h
+++ b/arm_compute/core/NEON/kernels/NEIntegralImageKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h b/arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h
index 3937bf0..302d04e 100644
--- a/arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NELKTrackerKernel.h b/arm_compute/core/NEON/kernels/NELKTrackerKernel.h
index cf99bbe..90e5f41 100644
--- a/arm_compute/core/NEON/kernels/NELKTrackerKernel.h
+++ b/arm_compute/core/NEON/kernels/NELKTrackerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h
index ad2a161..ba14598 100644
--- a/arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h
+++ b/arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h b/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h
index 7ad5bf0..ea42a38 100644
--- a/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h
+++ b/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
new file mode 100644
index 0000000..f3ea049
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEMAXUNPOOLINGLAYERKERNEL_H
+#define ARM_COMPUTE_NEMAXUNPOOLINGLAYERKERNEL_H
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the pooling layer kernel */
+class NEMaxUnpoolingLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEMaxUnpoolingLayerKernel";
+    }
+    /** Default constructor */
+    NEMaxUnpoolingLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMaxUnpoolingLayerKernel(const NEMaxUnpoolingLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMaxUnpoolingLayerKernel &operator=(const NEMaxUnpoolingLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEMaxUnpoolingLayerKernel(NEMaxUnpoolingLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEMaxUnpoolingLayerKernel &operator=(NEMaxUnpoolingLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEMaxUnpoolingLayerKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @note Output shape must be equal to the shape of the original input to pool.
+     *
+     * @param[in]  input     Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  indices   Tensor containing the offset to store the input elements in the output tensor.
+     *                       @ref NEPoolingLayerKernel with indices should precede this function in order to
+     *                       properly reconstruct the output tensor.
+     *                       The tensor shape of this tensor has to be equal to the input tensor shape. Data type supported: U32.
+     * @param[out] output    Destination tensor. Data types supported: Same as @p input.
+     * @param[in]  pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     */
+    void configure(const ITensor *input, const ITensor *indices, ITensor *output, const PoolingLayerInfo &pool_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEMaxUnpoolingLayerKernel
+     *
+     * @param[in] input     Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] indices   Tensor info of the indices of the maximal values. Data type supported: U32.
+     * @param[in] output    Destination tensor info. Data types supported: Same as @p input.
+     * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Function to perform 2x2 max unpooling, scattering input elements into the output at the positions given by the pooling indices.
+     *
+     * @param[in] window_input Input region on which to execute the kernel.
+     */
+    template <typename T>
+    void unpooling2(const Window &window_input);
+
+    using UnpoolingFunction = void (NEMaxUnpoolingLayerKernel::*)(const Window &window);
+
+private:
+    UnpoolingFunction _func;
+    const ITensor    *_input;
+    ITensor          *_output;
+    const ITensor    *_indices;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEMAXUNPOOLINGLAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h b/arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h
index 2197e3c..eef0e2b 100644
--- a/arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h
+++ b/arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h b/arm_compute/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h
index dc0455c..66b9075 100644
--- a/arm_compute/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h
+++ b/arm_compute/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h b/arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h
index 3e86860..f2871e2 100644
--- a/arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h
+++ b/arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEMemsetKernel.h b/arm_compute/core/NEON/kernels/NEMemsetKernel.h
index b4bcd11..f9a1914 100644
--- a/arm_compute/core/NEON/kernels/NEMemsetKernel.h
+++ b/arm_compute/core/NEON/kernels/NEMemsetKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEMinMaxLayerKernel.h b/arm_compute/core/NEON/kernels/NEMinMaxLayerKernel.h
index 445e12a..e7e87e9 100644
--- a/arm_compute/core/NEON/kernels/NEMinMaxLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEMinMaxLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h b/arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h
index 597a093..83f5afc 100644
--- a/arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h
+++ b/arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h b/arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h
index 43594ba..5fc225c 100644
--- a/arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h
+++ b/arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h b/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
index e2ddec9..bf5c520 100644
--- a/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
+++ b/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
index 4727164..665b102 100644
--- a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -76,7 +76,6 @@
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
 
 private:
     /** Function to perform normalization depending on the given template
@@ -104,7 +103,6 @@
     const ITensor         *_input_squared;
     ITensor               *_output;
     NormalizationLayerInfo _norm_info;
-    BorderSize             _border_size;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEPadLayerKernel.h b/arm_compute/core/NEON/kernels/NEPadLayerKernel.h
index 4cbefbd..80daabb 100644
--- a/arm_compute/core/NEON/kernels/NEPadLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPadLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEPermuteKernel.h b/arm_compute/core/NEON/kernels/NEPermuteKernel.h
index 89dc4e6..2f8af93 100644
--- a/arm_compute/core/NEON/kernels/NEPermuteKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPermuteKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
index 1a9dd6b..c530d78 100644
--- a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,42 +53,54 @@
     ~NEPixelWiseMultiplicationKernel() = default;
     /** Initialise the kernel's input, output and border mode.
      *
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)                         -> U8
+     *   - (U8,U8)                         -> S16
+     *   - (U8,S16)                        -> S16
+     *   - (S16,U8)                        -> S16
+     *   - (S16,S16)                       -> S16
+     *   - (F16,F16)                       -> F16
+     *   - (F32,F32)                       -> F32
+     *   - (QASYMM8,QASYMM8)               -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16)               -> QSYMM16
+     *   - (QSYMM16,QSYMM16)               -> S32
+     *
      * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
      *       For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
      *
-     * @param[in]  input1          An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
-     * @param[in]  input2          An input tensor. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, QSYMM16 (only if @p input1 is QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
-     * @param[out] output          Output tensor. Data types supported:
-     *                             - U8, only if both inputs are U8.
-     *                             - QASYMM8, only if both inputs are QASYMM8.
-     *                             - QASYMM8_SIGNED, only if @p input1 is QASYMM8_SIGNED.
-     *                             - S16.
-     *                             - QSYMM16, only if both inputs are QSYMM16.
-     *                             - S32, only if both inputs are QSYMM16.
-     *                             - F16, only if @p input1 is F16.
-     *                             - F32, only if both inputs are F32.
+     * @param[in]  input1          First input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
+     * @param[in]  input2          Second input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
+     * @param[out] output          Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
      * @param[in]  scale           Scale to apply after multiplication.
      *                             Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
      * @param[in]  overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16.
      * @param[in]  rounding_policy Rounding policy.
      */
-    void configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
+    void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
     /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplicationKernel
      *
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)                         -> U8
+     *   - (U8,U8)                         -> S16
+     *   - (U8,S16)                        -> S16
+     *   - (S16,U8)                        -> S16
+     *   - (S16,S16)                       -> S16
+     *   - (F16,F16)                       -> F16
+     *   - (F32,F32)                       -> F32
+     *   - (QASYMM8,QASYMM8)               -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16)               -> QSYMM16
+     *   - (QSYMM16,QSYMM16)               -> S32
+     *
      * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
      *       For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
      *
-     * @param[in] input1          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
-     * @param[in] input2          An input tensor info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, QSYMM16 (only if @p input1 is QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
-     * @param[in] output          Output tensor info. Data types supported:
-     *                            - U8, only if both inputs are U8.
-     *                            - QASYMM8, only if both inputs are QASYMM8.
-     *                            - QASYMM8_SIGNED, only if @p input1 is QASYMM8_SIGNED.
-     *                            - S16.
-     *                            - QSYMM16, only if both inputs are QSYMM16.
-     *                            - S32, only if both inputs are QSYMM16.
-     *                            - F16, only if @p input1 is F16.
-     *                            - F32, only if both inputs are F32.
+     * @param[in] input1          First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
+     * @param[in] input2          Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
+     * @param[in] output          Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
      * @param[in] scale           Scale to apply after multiplication.
      *                            Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
      * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16.
@@ -98,52 +110,46 @@
      */
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
 
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
 
 private:
     /** Common signature for all the specialised multiplication functions with integer scaling factor
      *
-     * @param[in]  input1_ptr Pointer to the first input tensor.
-     * @param[in]  input2_ptr Pointer to the second input tensor.
-     * @param[out] output_ptr Pointer to the output tensor.
-     * @param[in]  scale      Integer scale factor.
+     * @param[in]  in1    Input1 tensor object.
+     * @param[in]  in2    Input2 tensor object.
+     * @param[out] out    Output tensor object.
+     * @param[in]  window Region on which to execute the kernel.
+     * @param[in]  scale  Integer scale factor.
      */
-    using MulFunctionInt = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int scale);
+    using MulFunctionInt = void(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int scale);
     /** Common signature for all the specialised multiplication functions with float scaling factor
      *
-     * @param[in]  input1_ptr Pointer to the first input tensor.
-     * @param[in]  input2_ptr Pointer to the second input tensor.
-     * @param[out] output_ptr Pointer to the output tensor.
-     * @param[in]  scale      Float scale factor.
+     * @param[in]  in1    Input1 tensor object.
+     * @param[in]  in2    Input2 tensor object.
+     * @param[out] out    Output tensor object.
+     * @param[in]  window Region on which to execute the kernel.
+     * @param[in]  scale  Float scale factor.
      */
-    using MulFunctionFloat = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale);
+    using MulFunctionFloat = void(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale);
     /** Common signature for all the specialised QASYMM8 multiplication functions with float scaling factor
      *
-     * @param[in]  input1_ptr      Pointer to the first input tensor.
-     * @param[in]  input2_ptr      Pointer to the second input tensor.
-     * @param[out] output_ptr      Pointer to the output tensor.
-     * @param[in]  scale           Float scale factor.
-     * @param[in]  input1_qua_info Quantization Info of tensor input1.
-     * @param[in]  input2_qua_info Quantization Info of tensor input2.
-     * @param[in]  output_qua_info Quantization Info of tensor output.
+     * @param[in]  in1    Input1 tensor object.
+     * @param[in]  in2    Input2 tensor object.
+     * @param[out] out    Output tensor object.
+     * @param[in]  window Region on which to execute the kernel.
+     * @param[in]  scale  Float scale factor.
      *
      */
-    using MulFunctionQuantized = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale,
-                                      const UniformQuantizationInfo &input1_qua_info, const UniformQuantizationInfo &input2_qua_info, const UniformQuantizationInfo &output_qua_info);
+    using MulFunctionQuantized = void(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale);
 
     MulFunctionFloat     *_func_float;
     MulFunctionInt       *_func_int;
     MulFunctionQuantized *_func_quantized;
 
 private:
-    const ITensor *_input1;
-    const ITensor *_input2;
-    ITensor       *_output;
-    float          _scale;
-    int            _scale_exponent;
-    bool           _run_optimized_qasymm8;
+    float _scale;
+    int   _scale_exponent;
 };
 
 /** Interface for the complex pixelwise multiplication kernel. */
@@ -154,23 +160,13 @@
     {
         return "NEComplexPixelWiseMultiplicationKernel";
     }
-    /** Default constructor.*/
-    NEComplexPixelWiseMultiplicationKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEComplexPixelWiseMultiplicationKernel(const NEComplexPixelWiseMultiplicationKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEComplexPixelWiseMultiplicationKernel &operator=(const NEComplexPixelWiseMultiplicationKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEComplexPixelWiseMultiplicationKernel(NEComplexPixelWiseMultiplicationKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEComplexPixelWiseMultiplicationKernel &operator=(NEComplexPixelWiseMultiplicationKernel &&) = default;
     /** Initialise the kernel's input, output and border mode.
      *
      * @param[in]  input1 An input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
      * @param[in]  input2 An input tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
      * @param[out] output The output tensor, Data types supported: same as @p input1.  Number of channels supported: same as @p input1.
      */
-    void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
+    void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output);
     /** Static function to check if given info will lead to a valid configuration of @ref NEComplexPixelWiseMultiplicationKernel
      *
      * @param[in] input1 An input tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor).
@@ -182,13 +178,7 @@
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
 
     // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    const ITensor *_input1;
-    const ITensor *_input2;
-    ITensor       *_output;
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
 };
 
 } // namespace arm_compute
diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
index b0574b7..2be2508 100644
--- a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -91,12 +91,6 @@
      * @param[in] window_input Input region on which to execute the kernel.
      * @param[in] window       Output region on which to execute the kernel.
      */
-    void pooling2_f32_nchw_maxpool_indices(const Window &window_input, const Window &window);
-    /** Function to perform 2x2 pooling and compute the pooling indices. The indices can be used for max unpool.
-     *
-     * @param[in] window_input Input region on which to execute the kernel.
-     * @param[in] window       Output region on which to execute the kernel.
-     */
     void pooling2_f32_nhwc_maxpool_indices(const Window &window_input, const Window &window);
     /** Function to perform MxN pooling for 32-bit floating point values.
      *
@@ -138,6 +132,19 @@
      * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
      */
     void pooling2_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
+    /** Function to perform 2x2 pooling and compute the pooling indices for FP32/FP16. The indices can be used for max unpool.
+     *
+     * @param[in] window_input Input region on which to execute the kernel.
+     * @param[in] window       Output region on which to execute the kernel.
+     */
+    template <typename T>
+    void pooling2_nchw_maxpool_indices(const Window &window_input, const Window &window);
+    /** Function to perform 2x2 pooling and compute the pooling indices. The indices can be used for max unpool.
+     *
+     * @param[in] window_input Input region on which to execute the kernel.
+     * @param[in] window       Output region on which to execute the kernel.
+     */
+    void pooling2_f16_nhwc_maxpool_indices(const Window &window_input, const Window &window);
     /** Function to perform 3x3 pooling.
      *
      * @param[in] window_input    Input region on which to execute the kernel.
diff --git a/arm_compute/core/NEON/kernels/NEPriorBoxLayerKernel.h b/arm_compute/core/NEON/kernels/NEPriorBoxLayerKernel.h
index 6bf6574..84db991 100644
--- a/arm_compute/core/NEON/kernels/NEPriorBoxLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPriorBoxLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h b/arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h
index f5e8da7..86c9e1d 100644
--- a/arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h
+++ b/arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h
index 087e767..d35e027 100644
--- a/arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEROIAlignLayerKernel.h b/arm_compute/core/NEON/kernels/NEROIAlignLayerKernel.h
index bebcab5..66ebb5e 100644
--- a/arm_compute/core/NEON/kernels/NEROIAlignLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEROIAlignLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h
index 59a5017..fa9685b 100644
--- a/arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NERangeKernel.h b/arm_compute/core/NEON/kernels/NERangeKernel.h
index e67a5dc..84ebd53 100644
--- a/arm_compute/core/NEON/kernels/NERangeKernel.h
+++ b/arm_compute/core/NEON/kernels/NERangeKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEReductionOperationKernel.h b/arm_compute/core/NEON/kernels/NEReductionOperationKernel.h
index 28cca49..180697f 100644
--- a/arm_compute/core/NEON/kernels/NEReductionOperationKernel.h
+++ b/arm_compute/core/NEON/kernels/NEReductionOperationKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -59,7 +59,7 @@
 
     /** Set the source, destination of the kernel
      *
-     * @param[in]  input  Source tensor. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32. Data layouts supported: NCHW.
+     * @param[in]  input  Source tensor. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32/S32. Data layouts supported: NCHW.
      * @param[out] output Destination tensor.Data types and data layouts supported: same as @p input, S32 for ARG_MIX/ARG_MAX.
      *                    Output will have the same number of dimensions as input.
      * @param[in]  axis   Axis along which to reduce. Supported reduction axis : 0
@@ -69,7 +69,7 @@
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEReductionOperationKernel.
      *
-     * @param[in] input  Source tensor info. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32. Data layouts supported: NCHW.
+     * @param[in] input  Source tensor info. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32/S32. Data layouts supported: NCHW.
      * @param[in] output Destination tensor info.Data types and data layouts supported: same as @p input, S32 for ARG_MIX/ARG_MAX.
      *                   Output will have the same number of dimensions as input.
      * @param[in] axis   Axis along which to reduce. Supported reduction axis : 0
diff --git a/arm_compute/core/NEON/kernels/NERemapKernel.h b/arm_compute/core/NEON/kernels/NERemapKernel.h
index e929b1c..34c80a3 100644
--- a/arm_compute/core/NEON/kernels/NERemapKernel.h
+++ b/arm_compute/core/NEON/kernels/NERemapKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEReorgLayerKernel.h b/arm_compute/core/NEON/kernels/NEReorgLayerKernel.h
index 9277ddb..d751a6b 100644
--- a/arm_compute/core/NEON/kernels/NEReorgLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEReorgLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h b/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h
index fccf268..a4b8426 100644
--- a/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #ifndef ARM_COMPUTE_NERESHAPELAYERKERNEL_H
 #define ARM_COMPUTE_NERESHAPELAYERKERNEL_H
 
+#include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/NEON/INESimpleKernel.h"
 
 namespace arm_compute
@@ -32,19 +33,19 @@
 class ITensor;
 
 /** Interface for the kernel to perform tensor reshaping */
-class NEReshapeLayerKernel : public INESimpleKernel
+class NEReshapeLayerKernel : public INEKernel
 {
 public:
     const char *name() const override
     {
         return "NEReshapeLayerKernel";
     }
-    /** Set the input and output of the kernel
+    /** Set the input and output info of the kernel
      *
-     * @param[in]  input  Source tensor. Data type supported: All
-     * @param[out] output Destination tensor. Data type supported: Same as @p input
+     * @param[in]  input  Source tensor info. Data type supported: All
+     * @param[out] output Destination tensor info. Data type supported: Same as @p input
      */
-    void configure(const ITensor *input, ITensor *output);
+    void configure(const ITensorInfo *input, ITensorInfo *output);
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEReshapeLayerKernel
      *
@@ -56,7 +57,7 @@
     static Status validate(const ITensorInfo *input, const ITensorInfo *output);
 
     // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NERESHAPELAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEReverseKernel.h b/arm_compute/core/NEON/kernels/NEReverseKernel.h
index 516653b..fda7915 100644
--- a/arm_compute/core/NEON/kernels/NEReverseKernel.h
+++ b/arm_compute/core/NEON/kernels/NEReverseKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEScaleKernel.h b/arm_compute/core/NEON/kernels/NEScaleKernel.h
index 0d0d457..a2328b1 100644
--- a/arm_compute/core/NEON/kernels/NEScaleKernel.h
+++ b/arm_compute/core/NEON/kernels/NEScaleKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,8 +24,8 @@
 #ifndef ARM_COMPUTE_NESCALEKERNEL_H
 #define ARM_COMPUTE_NESCALEKERNEL_H
 
+#include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
 
 namespace arm_compute
 {
@@ -57,41 +57,29 @@
      * @note dx, dy and offsets have the same dimensions (width and height) of the output tensor
      * @note Using @p policy Area only supports data layout NCHW and input data type U8.
      *
-     * @param[in]  input                 Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32.
-     * @param[in]  dx                    Pixel's distance between the X real coordinate and the smallest X following integer. Data type supported: F32
-     * @param[in]  dy                    Pixel's distance between the Y real coordinate and the smallest Y following integer. Data type supported: F32
-     * @param[in]  offsets               Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32.
-     * @param[out] output                Destination tensor. Data types supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
-     * @param[in]  policy                Interpolation type to use
-     * @param[in]  border_mode           Border mode policy
-     * @param[in]  constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT and use_padding is set to false.
-     * @param[in]  sampling_policy       (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
-     * @param[in]  use_padding           (Optional) Is padding in use or not. Defaults to true.
-     * @param[in]  align_corners         (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
+     * @param[in]  input   Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32.
+     * @param[in]  dx      Pixel's distance between the X real coordinate and the smallest X following integer. Data type supported: F32
+     * @param[in]  dy      Pixel's distance between the Y real coordinate and the smallest Y following integer. Data type supported: F32
+     * @param[in]  offsets Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32.
+     * @param[out] output  Destination tensor. Data types supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in]  info    @ref ScaleKernelInfo to use for configuration
      */
     void configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets, ITensor *output,
-                   InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue(),
-                   SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool use_padding = true, bool align_corners = false);
+                   const ScaleKernelInfo &info);
     /** Static function to check if given info will lead to a valid configuration of @ref NEScaleKernel
      *
      * @note dx, dy and offsets have the same dimensions (width and height) of the output tensor
      * @note Using @p policy Area only supports data layout NCHW and input data type U8.
      *
-     * @param[in] input                 Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32.
-     * @param[in] dx                    Pixel's distance between the X real coordinate and the smallest X following integer. Data type supported: F32
-     * @param[in] dy                    Pixel's distance between the Y real coordinate and the smallest Y following integer. Data type supported: F32
-     * @param[in] offsets               Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32.
-     * @param[in] output                Destination tensor. Data types supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
-     * @param[in] policy                Interpolation type to use
-     * @param[in] border_mode           Border mode policy
-     * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT and use_padding is set to false.
-     * @param[in] sampling_policy       (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
-     * @param[in] use_padding           (Optional) Is padding in use or not. Defaults to true.
-     * @param[in] align_corners         (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
+     * @param[in] input   Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32.
+     * @param[in] dx      Pixel's distance between the X real coordinate and the smallest X following integer. Data type supported: F32
+     * @param[in] dy      Pixel's distance between the Y real coordinate and the smallest Y following integer. Data type supported: F32
+     * @param[in] offsets Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32.
+     * @param[in] output  Destination tensor. Data types supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in] info    @ref ScaleKernelInfo to use for validation
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *output,
-                           InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue(),
-                           SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool use_padding = true, bool align_corners = false);
+                           const ScaleKernelInfo &info);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
diff --git a/arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h b/arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h
index 320b44d..7e1fdb5 100644
--- a/arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h
+++ b/arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NESelectKernel.h b/arm_compute/core/NEON/kernels/NESelectKernel.h
index 51c8543..bb8695f 100644
--- a/arm_compute/core/NEON/kernels/NESelectKernel.h
+++ b/arm_compute/core/NEON/kernels/NESelectKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NESobel3x3Kernel.h b/arm_compute/core/NEON/kernels/NESobel3x3Kernel.h
index ef0db2a..66a13c4 100644
--- a/arm_compute/core/NEON/kernels/NESobel3x3Kernel.h
+++ b/arm_compute/core/NEON/kernels/NESobel3x3Kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NESobel5x5Kernel.h b/arm_compute/core/NEON/kernels/NESobel5x5Kernel.h
index bc0cfb0..02029b6 100644
--- a/arm_compute/core/NEON/kernels/NESobel5x5Kernel.h
+++ b/arm_compute/core/NEON/kernels/NESobel5x5Kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NESobel7x7Kernel.h b/arm_compute/core/NEON/kernels/NESobel7x7Kernel.h
index 468a94d..0e8b82c 100644
--- a/arm_compute/core/NEON/kernels/NESobel7x7Kernel.h
+++ b/arm_compute/core/NEON/kernels/NESobel7x7Kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
index 0e0be79..e80cd22 100644
--- a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h b/arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h
index 532fbb2..b5d7c69 100644
--- a/arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h b/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h
index e0c22e6..11443e0 100644
--- a/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEStackLayerKernel.h b/arm_compute/core/NEON/kernels/NEStackLayerKernel.h
index c4dc53e..710a6be 100644
--- a/arm_compute/core/NEON/kernels/NEStackLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEStackLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEStridedSliceKernel.h b/arm_compute/core/NEON/kernels/NEStridedSliceKernel.h
index 6709619..be55fd7 100644
--- a/arm_compute/core/NEON/kernels/NEStridedSliceKernel.h
+++ b/arm_compute/core/NEON/kernels/NEStridedSliceKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,8 +58,8 @@
      *
      * @note Supported tensor rank: up to 4
      *
-     * @param[in]  input            Source tensor. Data type supported: All
-     * @param[out] output           Destination tensor. Data type supported: Same as @p input
+     * @param[in]  input            Source tensor info. Data type supported: All
+     * @param[out] output           Destination tensor info. Data type supported: Same as @p input
      * @param[in]  starts           The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
      * @param[in]  ends             The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
      * @param[in]  strides          The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input).
@@ -68,7 +68,7 @@
      * @param[in]  shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
      *                              A slice of size 1 starting from starts[i] in the dimension must be preserved.
      */
-    void configure(const ITensor *input, ITensor *output,
+    void configure(const ITensorInfo *input, ITensorInfo *output,
                    const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
                    int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
 
@@ -91,14 +91,12 @@
                            int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
 
     // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
 
 private:
-    const ITensor *_input;         /**< Source tensor */
-    ITensor       *_output;        /**< Destination tensor */
-    Coordinates    _starts_abs;    /**< Absolute start coordinates */
-    Coordinates    _final_strides; /**< Final strides */
-    int32_t        _shrink_mask;   /**< Shrink axis mask */
+    Coordinates _starts_abs;    /**< Absolute start coordinates */
+    Coordinates _final_strides; /**< Final strides */
+    int32_t     _shrink_mask;   /**< Shrink axis mask */
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NE_STRIDED_SLICE_KERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NETableLookupKernel.h b/arm_compute/core/NEON/kernels/NETableLookupKernel.h
index 13a76cb..58bfdbe 100644
--- a/arm_compute/core/NEON/kernels/NETableLookupKernel.h
+++ b/arm_compute/core/NEON/kernels/NETableLookupKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEThresholdKernel.h b/arm_compute/core/NEON/kernels/NEThresholdKernel.h
index a6d1e90..daad47d 100644
--- a/arm_compute/core/NEON/kernels/NEThresholdKernel.h
+++ b/arm_compute/core/NEON/kernels/NEThresholdKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,18 +24,15 @@
 #ifndef ARM_COMPUTE_NETHRESHOLDKERNEL_H
 #define ARM_COMPUTE_NETHRESHOLDKERNEL_H
 
+#include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/Types.h"
 
-#include <cstdint>
-
 namespace arm_compute
 {
 class ITensor;
 
-/** Interface for the thresholding kernel
- *
- */
+/** Interface for the thresholding kernel */
 class NEThresholdKernel : public INEKernel
 {
 public:
@@ -53,15 +50,20 @@
     NEThresholdKernel &operator=(const NEThresholdKernel &) = delete;
     /** Initialise the kernel's input, output and threshold parameters.
      *
-     * @param[in]  input       An input tensor. Data type supported: U8
-     * @param[out] output      The output tensor. Data type supported: U8.
-     * @param[in]  threshold   Threshold. When the threhold type is RANGE, this is used as the lower threshold.
-     * @param[in]  false_value value to set when the condition is not respected.
-     * @param[in]  true_value  value to set when the condition is respected.
-     * @param[in]  type        Thresholding type. Either RANGE or BINARY.
-     * @param[in]  upper       Upper threshold. Only used when the thresholding type is RANGE.
+     * @param[in]  input  An input tensor. Data type supported: U8
+     * @param[out] output The output tensor. Data type supported: U8.
+     * @param[in]  info   Threshold kernel descriptor
      */
-    void configure(const ITensor *input, ITensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper);
+    void configure(const ITensor *input, ITensor *output, const ThresholdKernelInfo &info);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEThresholdKernel
+     *
+     * @param[in] input  Input tensor info. Data type supported: U8
+     * @param[in] output Output tensor info. Data type supported: U8
+     * @param[in] info   Threshold kernel descriptor
+     *
+     * @return A status containing an error code in case of failure
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ThresholdKernelInfo &info);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
@@ -74,12 +76,9 @@
 
     void (NEThresholdKernel::*_func)(const Window &window);
 
-    const ITensor *_input;  /**< Input */
-    ITensor       *_output; /**< Output */
-    uint8_t        _threshold;
-    uint8_t        _false_value;
-    uint8_t        _true_value;
-    uint8_t        _upper;
+    const ITensor      *_input;  /**< Input */
+    ITensor            *_output; /**< Output */
+    ThresholdKernelInfo _info;   /**< Threshold descriptor */
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NETHRESHOLDKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NETileKernel.h b/arm_compute/core/NEON/kernels/NETileKernel.h
index a64470f..7a3039a 100644
--- a/arm_compute/core/NEON/kernels/NETileKernel.h
+++ b/arm_compute/core/NEON/kernels/NETileKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NETransposeKernel.h b/arm_compute/core/NEON/kernels/NETransposeKernel.h
index a14dece..1507a1c 100644
--- a/arm_compute/core/NEON/kernels/NETransposeKernel.h
+++ b/arm_compute/core/NEON/kernels/NETransposeKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h b/arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h
index 1ea3f97..a1278ea 100644
--- a/arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -52,7 +52,7 @@
     ~NEUpsampleLayerKernel() = default;
     /** Set the input output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: QASYMM8/F16/F32.
+     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[out] output Destination tensor. Data types supported: same as @p input.
      * @param[in]  info   Contains stride information described in @ref Size2D.
      * @param[in]  policy Defines the policy to fill the intermediate pixels.
@@ -61,7 +61,7 @@
     void configure(const ITensor *input, ITensor *output, const Size2D &info, const InterpolationPolicy policy);
     /** Static function to check if given info will lead to a valid configuration of @ref NEUpsampleLayerKernel
      *
-     * @param[in] input  Source tensor info. Data types supported: QASYMM8/F16/F32.
+     * @param[in] input  Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[in] output Destination tensor info. Data types supported: same as @p input.
      * @param[in] info   Contains stride information described in @ref Size2D.
      * @param[in] policy Defines the policy to fill the intermediate pixels.
@@ -94,7 +94,6 @@
     const ITensor      *_input;
     ITensor            *_output;
     Size2D              _info;
-    unsigned int        _num_elems_processed_per_iteration_x;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NEUPSAMPLELAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEWarpKernel.h b/arm_compute/core/NEON/kernels/NEWarpKernel.h
index 61ca21e..21fc7b2 100644
--- a/arm_compute/core/NEON/kernels/NEWarpKernel.h
+++ b/arm_compute/core/NEON/kernels/NEWarpKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
index b68cb50..8cb3ed8 100644
--- a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
+++ b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -76,7 +76,7 @@
      *
      * @param[in]  input  The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
      *                    and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared.
-     *                    Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/FP16/F32
+     *                    Data types supported: All
      * @param[in]  bias   The shared biases tensor to append.  Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
      *                    dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
      *                    @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
@@ -87,7 +87,7 @@
      *
      * @param[in] input  The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
      *                   and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM,  num_patches] if unshared.
-     *                   Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32
+     *                   Data types supported: All
      * @param[in] biases The shared biases tensor to append.  Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
      *                   dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
      *                   @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
diff --git a/arm_compute/core/NEON/kernels/NEWidthConcatenateLayerKernel.h b/arm_compute/core/NEON/kernels/NEWidthConcatenateLayerKernel.h
index f22f18f..64d741d 100644
--- a/arm_compute/core/NEON/kernels/NEWidthConcatenateLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEWidthConcatenateLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,12 +57,11 @@
     ~NEWidthConcatenateLayerKernel() = default;
     /** Initialise the kernel's inputs and output
      *
-     * @param[in]     input        Input tensor. Data types supported: All
+     * @param[in]     input        Input tensor info. Data types supported: All
      * @param[in]     width_offset The offset on the X axis.
-     * @param[in,out] output       Output tensor. Data types supported: Same as @p input.
-     *
+     * @param[in,out] output       Output tensor info. Data types supported: Same as @p input.
      */
-    void configure(const ITensor *input, unsigned int width_offset, ITensor *output);
+    void configure(const ITensorInfo *input, unsigned int width_offset, ITensorInfo *output);
     /**  Static function to check if given info will lead to a valid configuration of @ref NEWidthConcatenateLayerKernel
      *
      * @param[in] input        Input tensor info. Data types supported: All
@@ -74,12 +73,10 @@
     static Status validate(const ITensorInfo *input, unsigned int width_offset, const ITensorInfo *output);
 
     // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
 
 private:
-    const ITensor *_input;
-    ITensor       *_output;
-    unsigned int   _width_offset;
+    unsigned int _width_offset;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NEWIDTHCONCATENATELAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h b/arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h
index 0fd3f8c..8795e4a 100644
--- a/arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/arm_gemm/ndrange.hpp b/arm_compute/core/NEON/kernels/arm_gemm/ndrange.hpp
deleted file mode 100644
index 4ff83fb..0000000
--- a/arm_compute/core/NEON/kernels/arm_gemm/ndrange.hpp
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include <array>
-#include <algorithm>
-#include <initializer_list>
-
-#include <cassert>
-
-namespace arm_gemm {
-
-template<unsigned int D>
-class NDRange {
-private:
-    std::array<unsigned int, D> m_sizes {};
-    std::array<unsigned int, D> m_totalsizes {};
-
-    class NDRangeIterator {
-    private:
-        const NDRange &m_parent;
-        unsigned int m_pos = 0;
-        unsigned int m_end = 0;
-
-    public:
-        NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) : m_parent(p), m_pos(s), m_end(e) { }
-
-        bool done() const {
-            return (m_pos >= m_end);
-        }
-
-        unsigned int dim(unsigned int d) const {
-            unsigned int r = m_pos;
-
-            if (d < (D - 1)) {
-                r %= m_parent.m_totalsizes[d];
-            }
-
-            if (d > 0) {
-                r /= m_parent.m_totalsizes[d-1];
-            }
-
-            return r;
-        }
-
-        bool next_dim0() {
-            m_pos++;
-
-            return !done();
-        }
-
-        bool next_dim1() {
-            m_pos += m_parent.m_sizes[0] - dim(0);
-
-            return !done();
-        }
-
-        unsigned int dim0_max() const {
-            unsigned int offset = std::min(m_end - m_pos, m_parent.m_sizes[0] - dim(0));
-
-            return dim(0) + offset;
-        }
-    };
-
-public:
-    NDRange& operator=(const NDRange& rhs)=default;
-    NDRange(const NDRange& rhs)           =default;
-
-    template <typename... T>
-    NDRange(T... ts)
-    : m_sizes{ts...}
-    {
-        unsigned int t=1;
-
-        for (unsigned int i=0; i<D; i++) {
-            t *= m_sizes[i];
-
-            m_totalsizes[i] = t;
-        }
-    }
-
-    NDRange(const std::array<unsigned int, D>& n)
-    : m_sizes(n)
-    {
-        unsigned int t=1;
-
-        for (unsigned int i=0; i<D; i++) {
-            t *= m_sizes[i];
-
-            m_totalsizes[i] = t;
-        }
-    }
-
-    NDRangeIterator iterator(unsigned int start, unsigned int end) const {
-        return NDRangeIterator(*this, start, end);
-    }
-
-    unsigned int total_size() const {
-        return m_totalsizes[D - 1];
-    }
-
-    unsigned int get_size(unsigned int v) const {
-        return m_sizes[v];
-    }
-};
-
-/** NDCoordinate builds upon a range, but specifies a starting position
- * in addition to a size which it inherits from NDRange
- */
-template<unsigned int N>
-class NDCoordinate : public NDRange<N> {
-    using int_t     =unsigned int;
-    using ndrange_t = NDRange<N>;
-
-    std::array<int_t, N> m_positions {};
-public:
-    NDCoordinate& operator=(const NDCoordinate& rhs)=default;
-    NDCoordinate(const NDCoordinate& rhs)           =default;
-    NDCoordinate(const std::initializer_list<std::pair<int_t, int_t>>& list)
-    {
-        std::array<int_t, N> sizes{};
-
-        std::size_t i = 0;
-        for(auto& p : list) {
-            m_positions[i]= p.first;
-            sizes[i++]    = p.second;
-        }
-
-        //update the parents sizes
-        static_cast<ndrange_t&>(*this) = ndrange_t(sizes);
-    }
-
-    int_t get_position(int_t d) const {
-        assert(d < m_positions.size());
-        return m_positions[d];
-    }
-
-    void set_position(int_t d, int_t v) {
-        assert(d < size(m_positions));
-        assert(v < ndrange_t::get_size(d));
-
-        m_positions[d] = v;
-    }
-
-    int_t get_position_end(int_t d) const {
-        return get_position(d) + NDRange<N>::get_size(d);
-    }
-}; //class NDCoordinate
-
-/** @returns the number of dimensions in the NDRange which have none-1 values
- * IE there is actual work in these dimensions that can be broken up
- */
-template<unsigned int N>
-std::size_t ndrange_popcount(const NDRange<N>& ndr) {
-    std::size_t count = 0;
-
-    for(unsigned int d = 0; d != N; ++d) {
-        if(ndr.get_size(d) != 1)
-            ++count;
-    }
-    return count;
-}
-
-} // namespace arm_gemm
diff --git a/arm_compute/core/NEON/kernels/assembly/Helpers.h b/arm_compute/core/NEON/kernels/assembly/Helpers.h
deleted file mode 100644
index 9372e05..0000000
--- a/arm_compute/core/NEON/kernels/assembly/Helpers.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ASSEMBLY_HELPERS_H
-#define ARM_COMPUTE_ASSEMBLY_HELPERS_H
-
-#include "arm_compute/core/CPP/CPPTypes.h"
-#include "arm_compute/core/Utils.h"
-
-#include "arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h"
-#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp"
-
-namespace arm_compute
-{
-/** Block sizes to use to break the M, N, K dimension */
-struct BlockSizes
-{
-    unsigned int k_block{ 0 };             /**< Block size alon the K dimension */
-    unsigned int x_block{ 0 };             /**< Block size along the N (x) dimension */
-    unsigned int m_round{ 0 };             /**< Block size along the M dimension (Must be a multiple of strategy_out_height) */
-    unsigned int strategy_out_height{ 0 }; /**< Number of rows (M) processed by the selected strategy */
-};
-
-/** Extracts the kernel description of the selected kernel by the GEMM backend heuristics
- *
- * @param[in] input_type        Data type of the input tensor.
- * @param[in] ci                CPU information.
- * @param[in] num_threads       Maximum number of threads that might be used for the calculations.
- * @param[in] p                 M, N, K sizes.
- * @param[in] activation        Activation struct
- * @param[in] pretranspose_hint Is B also pretransposed ?
- *
- * @return Kernel description that the assembly heuristics picked for the given configuration
- */
-arm_gemm::KernelDescription get_gemm_info(DataType                            input_type,
-                                          const CPUInfo                      &ci,
-                                          const unsigned int                  num_threads,
-                                          const INEGEMMWrapperKernel::Params &p,
-                                          arm_gemm::Activation                activation,
-                                          bool                                pretranspose_hint);
-
-/** Calculate the recommended block sizes to use based on the CPU cache sizes and the strategy which will be used
- *
- * @param[in] ci CPU information.
- * @param[in] M  M dimension.
- * @param[in] N  N dimension.
- * @param[in] K  K dimension.
- *
- * @return Recommeded block sizes to use for the given M, N, K dimensions.
- */
-template <typename strategy>
-BlockSizes calculate_block_sizes(const CPUInfo &ci, unsigned int M, unsigned int N, unsigned int K)
-{
-    BlockSizes bs;
-
-    using Toi = typename strategy::operand_type;
-
-    const unsigned int L1_size = ci.get_L1_cache_size();
-    const unsigned int L2_size = ci.get_L2_cache_size();
-
-    // Work out blocking parameters
-
-    // k_block: Find out how much of the larger array can be loaded into half the cache.
-    // This should account for associative caches.
-    bs.k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
-
-    // Needs to be (at least a single) multiple of the K unroll level.
-    bs.k_block /= strategy::k_unroll();
-    bs.k_block = std::max(bs.k_block, 1U) * strategy::k_unroll();
-
-    // Now tune to presented problem size; this is how many blocks we need.
-    int num_k_blocks = DIV_CEIL(K, bs.k_block);
-
-    // So divide the space equally into that many blocks.
-    bs.k_block = DIV_CEIL(K, num_k_blocks);
-
-    // And round UP to the K unroll level required.
-    bs.k_block = ceil_to_multiple(bs.k_block, strategy::k_unroll());
-
-    // x_block: Work out how many rows (of length k_block) will fit in the L2
-    // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
-    bs.x_block = (((L2_size * 9) / 10) - (bs.k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) / (sizeof(Toi) * bs.k_block);
-
-    // Needs to be (at least a single) multiple of the kernel output width.
-    bs.x_block /= strategy::out_width();
-    bs.x_block = std::max(bs.x_block, 1U) * strategy::out_width();
-
-    // And tune to the presented problem size.
-    int num_x_blocks = DIV_CEIL(N, bs.x_block);
-    bs.x_block       = DIV_CEIL(N, num_x_blocks);
-
-    bs.x_block = ceil_to_multiple(bs.x_block, strategy::out_width());
-
-    // Work out the rounded size of M - needed for some buffers.
-    bs.m_round             = ceil_to_multiple(M, strategy::out_height());
-    bs.strategy_out_height = strategy::out_height();
-
-    return bs;
-}
-
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_ASSEMBLY_HELPERS_H */
diff --git a/arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h b/arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h
index f152ab5..74161e3 100644
--- a/arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h
+++ b/arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h b/arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h
index 8a9fb82..7c10f85 100644
--- a/arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h
+++ b/arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp b/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp
deleted file mode 100644
index 7723224..0000000
--- a/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include <memory>
-#include <cstring>
-
-#include "arm_gemm_local.hpp"
-#include "gemm_common.hpp"
-
-namespace arm_gemm {
-
-enum class GemmMethod
-{
-    DEFAULT,
-    GEMV_BATCHED,
-    GEMV_PRETRANSPOSED,
-    GEMV_NATIVE_TRANSPOSED,
-    GEMM_NATIVE,
-    GEMM_HYBRID,
-    GEMM_INTERLEAVED,
-    GEMM_INTERLEAVED_2D,
-    QUANTIZE_WRAPPER,
-    GEMM_HYBRID_QUANTIZED
-};
-
-struct KernelDescription
-{
-    GemmMethod   method      = GemmMethod::DEFAULT;
-    std::string  name        = "";
-    bool         is_default  = false;
-
-    KernelDescription(GemmMethod m, std::string n, bool d=false) : method(m), name(n), is_default(d) { }
-    KernelDescription() noexcept  { }
-};
-
-struct GemmConfig
-{
-    GemmMethod   method           = GemmMethod::DEFAULT;
-    std::string  filter           = "";
-    unsigned int inner_block_size = 0;
-    unsigned int outer_block_size = 0;
-
-    GemmConfig(GemmMethod method) : method(method) { }
-    GemmConfig() { }
-};
-
-struct Activation
-{
-    enum class Type {
-        None,
-        ReLU,
-        BoundedReLU
-    };
-
-    Type    type;
-    float   param1;
-    float   param2;
-
-    Activation(Type type=Type::None, float p1=0.0f, float p2=0.0f) : type(type), param1(p1), param2(p2) { }
-};
-
-struct GemmArgs
-{
-public:
-    const CPUInfo    *_ci;
-    unsigned int      _Msize;
-    unsigned int      _Nsize;
-    unsigned int      _Ksize;
-    unsigned int      _nbatches;
-    unsigned int      _nmulti;
-    bool              _trA;
-    bool              _trB;
-    Activation        _act;
-    int               _maxthreads;
-    bool              _pretransposed_hint;
-    const GemmConfig *_cfg;
-
-    GemmArgs(const CPUInfo *ci, const unsigned int M, const unsigned int N,
-             const unsigned int K, const unsigned int nbatches,
-             const unsigned int nmulti, const bool trA, const bool trB,
-             Activation act, const int maxthreads,
-             const bool pretransposed_hint, const GemmConfig *cfg=nullptr ) :
-             _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmulti(nmulti),
-             _trA(trA), _trB(trB), _act(act), _maxthreads(maxthreads),
-             _pretransposed_hint(pretransposed_hint), _cfg(cfg)
-    {
-    }
-};
-
-struct Requantize32
-{
-public:
-    const int32_t  *bias = nullptr;
-    size_t          bias_multi_stride = 0;
-    int32_t         a_offset = 0;
-    int32_t         b_offset = 0;
-    int32_t         c_offset = 0;
-    bool            per_channel_requant = false;
-    int32_t         per_layer_shift = 0;
-    int32_t         per_layer_mul = 0;
-    const int32_t  *per_channel_shifts = nullptr;
-    const int32_t  *per_channel_muls = nullptr;
-    int32_t         minval = 0;
-    int32_t         maxval = 0;
-
-    Requantize32() = default;
-
-    // Constructor for per-tensor quantization
-    Requantize32(const int32_t *bias, size_t bias_multi_stride,
-                 int32_t a_offset, int32_t b_offset, int32_t c_offset,
-                 int32_t requant_shift, int32_t requant_mul,
-                 int32_t minv, int32_t maxv) :
-        bias(bias), bias_multi_stride(bias_multi_stride),
-        a_offset(a_offset), b_offset(b_offset), c_offset(c_offset),
-        per_channel_requant(false), per_layer_shift(requant_shift), per_layer_mul(requant_mul),
-        minval(minv), maxval(maxv)
-    {
-    }
-
-    // Constructor for per-channel quantization
-    Requantize32(const int32_t *bias, size_t bias_multi_stride,
-                 int32_t a_offset, int32_t b_offset, int32_t c_offset,
-                 const int32_t *requant_shifts, const int32_t *requant_muls,
-                 int32_t minv, int32_t maxv) :
-        bias(bias), bias_multi_stride(bias_multi_stride),
-        a_offset(a_offset), b_offset(b_offset), c_offset(c_offset),
-        per_channel_requant(true), per_channel_shifts(requant_shifts), per_channel_muls(requant_muls),
-        minval(minv), maxval(maxv)
-    {
-    }
-};
-
-struct Nothing
-{
-};
-
-template<typename Top, typename Tret>
-using UniqueGemmCommon = std::unique_ptr<GemmCommon<Top, Tret> >;
-
-/* Low level API calls.
- * These are implemented as 'GemmArgs' versions, or with the arguments explicitly listed. */
-
-/* get_gemm_method(): Given the templated types and provided parameters,
- * which is the preferred method to implement this GEMM?  */
-template<typename Top, typename Tret, class OutputStage = Nothing>
-KernelDescription get_gemm_method(const GemmArgs &args, const OutputStage & ={});
-
-template<typename Top, typename Tret, class OutputStage = Nothing>
-UniqueGemmCommon<Top, Tret> gemm(const GemmArgs &args, const OutputStage & ={});
-
-template<typename Top, typename Tret, class OutputStage = Nothing>
-std::vector<KernelDescription> get_compatible_kernels(const GemmArgs &args, const OutputStage & ={});
-
-} // namespace arm_gemm
diff --git a/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp b/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp
index 8d3db4a..de92cce 100644
--- a/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/convolution/common/activation.hpp b/arm_compute/core/NEON/kernels/convolution/common/activation.hpp
index 091b165..0c9b7c1 100644
--- a/arm_compute/core/NEON/kernels/convolution/common/activation.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/common/activation.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/convolution/common/alloc.hpp b/arm_compute/core/NEON/kernels/convolution/common/alloc.hpp
index 799e95d..7be3cda 100644
--- a/arm_compute/core/NEON/kernels/convolution/common/alloc.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/common/alloc.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/convolution/common/arm.hpp b/arm_compute/core/NEON/kernels/convolution/common/arm.hpp
index 90e7828..b19bf98 100644
--- a/arm_compute/core/NEON/kernels/convolution/common/arm.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/common/arm.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/convolution/common/convolution.hpp b/arm_compute/core/NEON/kernels/convolution/common/convolution.hpp
index 2ab2597..b141352 100644
--- a/arm_compute/core/NEON/kernels/convolution/common/convolution.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/common/convolution.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/convolution/common/padding.hpp b/arm_compute/core/NEON/kernels/convolution/common/padding.hpp
index 97b21e0..b6f9587 100644
--- a/arm_compute/core/NEON/kernels/convolution/common/padding.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/common/padding.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/convolution/common/perf.h b/arm_compute/core/NEON/kernels/convolution/common/perf.h
index 3c0d366..fbae4dc 100644
--- a/arm_compute/core/NEON/kernels/convolution/common/perf.h
+++ b/arm_compute/core/NEON/kernels/convolution/common/perf.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp b/arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp
index 6029cb6..88ef732 100644
--- a/arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/convolution/common/qsymm8.hpp b/arm_compute/core/NEON/kernels/convolution/common/qsymm8.hpp
index 41bfbe4..726a02c 100644
--- a/arm_compute/core/NEON/kernels/convolution/common/qsymm8.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/common/qsymm8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/convolution/common/shims.hpp b/arm_compute/core/NEON/kernels/convolution/common/shims.hpp
index 243d305..310bd47 100644
--- a/arm_compute/core/NEON/kernels/convolution/common/shims.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/common/shims.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/convolution/common/tensor.hpp b/arm_compute/core/NEON/kernels/convolution/common/tensor.hpp
index ad0a677..7738cdb 100644
--- a/arm_compute/core/NEON/kernels/convolution/common/tensor.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/common/tensor.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/convolution/common/tensor_utils.hpp b/arm_compute/core/NEON/kernels/convolution/common/tensor_utils.hpp
index 0c23443..82619f4 100644
--- a/arm_compute/core/NEON/kernels/convolution/common/tensor_utils.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/common/tensor_utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/convolution/common/utils.hpp b/arm_compute/core/NEON/kernels/convolution/common/utils.hpp
index 99b2282..b7a9517 100644
--- a/arm_compute/core/NEON/kernels/convolution/common/utils.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/common/utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
index a4a833d..70d6689 100644
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp
index e0d7f0c..1bae815 100644
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
index 37c1f1b..4343f6a 100644
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp
index cf1c6f5..a11b098 100644
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h b/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h
index 4861559..067a18c 100644
--- a/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h
+++ b/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,6 +45,7 @@
     {
         ARM_COMPUTE_UNUSED(act_info);
     }
+
     /** Run activation function.
      *
      * @param[in] vval Vector of values.
@@ -53,6 +54,15 @@
     {
         ARM_COMPUTE_UNUSED(vval);
     }
+
+    /** Run activation function.
+     *
+     * @param[in] val Scalar value.
+     */
+    void operator()(T &val)
+    {
+        ARM_COMPUTE_UNUSED(val);
+    }
 };
 /** Linear activation object */
 template <typename T, int S>
@@ -68,8 +78,10 @@
      * @param[in] act_info Activation layer information.
      */
     explicit linear(ActivationLayerInfo act_info)
-        : valpha(wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{})),
-          vbeta(wrapper::vdup_n(static_cast<T>(act_info.b()), ExactTagType{}))
+        : alpha(act_info.a()),
+          beta(act_info.b()),
+          valpha(wrapper::vdup_n(static_cast<T>(alpha), ExactTagType{})),
+          vbeta(wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}))
     {
     }
 
@@ -79,13 +91,22 @@
      */
     void operator()(ExactType &vval)
     {
-        vval = wrapper::vmla(vval, valpha, vbeta);
+        vval = wrapper::vmla(vbeta, vval, valpha);
     }
 
-    /** Vector of alphas. */
-    const ExactType valpha;
-    /** Vector of betas. */
-    const ExactType vbeta;
+    /** Run activation function.
+     *
+     * @param[in] val Scalar value.
+     */
+    void operator()(T &val)
+    {
+        val = alpha * val + beta;
+    }
+
+    const T         alpha;  /**< Scalar alpha */
+    const T         beta;   /**< Scalar beta */
+    const ExactType valpha; /**< Vector of alphas. */
+    const ExactType vbeta;  /**< Vector of betas. */
 };
 /** Square activation object */
 template <typename T, int S>
@@ -113,6 +134,15 @@
     {
         vval = wrapper::vmul(vval, vval);
     }
+
+    /** Run activation function.
+     *
+     * @param[in] val Scalar value.
+     */
+    void operator()(T &val)
+    {
+        val = val * val;
+    }
 };
 /** Logistic activation object */
 template <typename T, int S>
@@ -128,7 +158,7 @@
      * @param[in] act_info Activation layer information.
      */
     explicit logistic(ActivationLayerInfo act_info)
-        : vone(wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{}))
+        : vone(wrapper::vdup_n(static_cast<T>(1), ExactTagType{}))
     {
         ARM_COMPUTE_UNUSED(act_info);
     }
@@ -142,6 +172,15 @@
         vval = wrapper::vinv(wrapper::vadd(vone, wrapper::vexpq(wrapper::vneg(vval))));
     }
 
+    /** Run activation function.
+     *
+     * @param[in] val Scalar value.
+     */
+    void operator()(T &val)
+    {
+        val = 1 / (1 + std::exp(-val));
+    }
+
     /** Vector of ones. */
     const ExactType vone;
 };
@@ -159,7 +198,7 @@
      * @param[in] act_info Activation layer information.
      */
     explicit relu(ActivationLayerInfo act_info)
-        : vzero(wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}))
+        : vzero(wrapper::vdup_n(static_cast<T>(0), ExactTagType{}))
     {
         ARM_COMPUTE_UNUSED(act_info);
     }
@@ -173,6 +212,15 @@
         vval = wrapper::vmax(vzero, vval);
     }
 
+    /** Run activation function.
+     *
+     * @param[in] val Scalar value.
+     */
+    void operator()(T &val)
+    {
+        val = std::max(static_cast<T>(0), val);
+    }
+
     /** Vector of zeroes. */
     const ExactType vzero;
 };
@@ -190,7 +238,8 @@
      * @param[in] act_info Activation layer information.
      */
     explicit brelu(ActivationLayerInfo act_info)
-        : vzero(wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{})),
+        : alpha(act_info.a()),
+          vzero(wrapper::vdup_n(static_cast<T>(0), ExactTagType{})),
           valpha(wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{}))
     {
     }
@@ -204,10 +253,18 @@
         vval = wrapper::vmin(valpha, wrapper::vmax(vzero, vval));
     }
 
-    /** Vector of zeroes. */
-    const ExactType vzero;
-    /** Vector of alphas. */
-    const ExactType valpha;
+    /** Run activation function.
+     *
+     * @param[in] val Scalar value.
+     */
+    void operator()(T &val)
+    {
+        val = std::min(alpha, std::max(static_cast<T>(0), val));
+    }
+
+    const T         alpha;  /**< Scalar alpha */
+    const ExactType vzero;  /**< Vector of zeroes. */
+    const ExactType valpha; /**< Vector of alphas. */
 };
 /** Lower-Upper Bounded RELU activation object */
 template <typename T, int S>
@@ -223,7 +280,9 @@
      * @param[in] act_info Activation layer information.
      */
     explicit lubrelu(ActivationLayerInfo act_info)
-        : valpha(wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{})),
+        : alpha(act_info.a()),
+          beta(act_info.b()),
+          valpha(wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{})),
           vbeta(wrapper::vdup_n(static_cast<T>(act_info.b()), ExactTagType{}))
     {
     }
@@ -237,10 +296,19 @@
         vval = wrapper::vmin(valpha, wrapper::vmax(vbeta, vval));
     }
 
-    /** Vector of alphas. */
-    const ExactType valpha;
-    /** Vector of betas. */
-    const ExactType vbeta;
+    /** Run activation function.
+     *
+     * @param[in] val Scalar value.
+     */
+    void operator()(T &val)
+    {
+        val = std::min(alpha, std::max(beta, val));
+    }
+
+    const T         alpha;  /**< Scalar alpha */
+    const T         beta;   /**< Scalar beta */
+    const ExactType valpha; /**< Vector of alphas. */
+    const ExactType vbeta;  /**< Vector of betas. */
 };
 } // namespace detail
 } // namespace arm_compute
diff --git a/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h b/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h
index d756a9a..41ad8fc 100644
--- a/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h
+++ b/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h b/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
index d4cbc7f..78f08fd 100644
--- a/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
+++ b/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/abs.h b/arm_compute/core/NEON/wrapper/intrinsics/abs.h
index aff1816..6927fa6 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/abs.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/abs.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/add.h b/arm_compute/core/NEON/wrapper/intrinsics/add.h
index 776e136..5bca891 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/add.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/add.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/and.h b/arm_compute/core/NEON/wrapper/intrinsics/and.h
index 1973c55..8fffe35 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/and.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/and.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/bsl.h b/arm_compute/core/NEON/wrapper/intrinsics/bsl.h
index 3c26a9c..6d01b8a 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/bsl.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/bsl.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/ceq.h b/arm_compute/core/NEON/wrapper/intrinsics/ceq.h
index f8a8f91..a84984d 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/ceq.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/ceq.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/cge.h b/arm_compute/core/NEON/wrapper/intrinsics/cge.h
index bf231b8..ac2973b 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/cge.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/cge.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/cgt.h b/arm_compute/core/NEON/wrapper/intrinsics/cgt.h
index 5202a5b..c7ae2ca 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/cgt.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/cgt.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/cle.h b/arm_compute/core/NEON/wrapper/intrinsics/cle.h
new file mode 100644
index 0000000..50c175f
--- /dev/null
+++ b/arm_compute/core/NEON/wrapper/intrinsics/cle.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_CLE_H
+#define ARM_COMPUTE_WRAPPER_CLE_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VCLE_IMPL(stype, vtype, rtype, prefix, postfix) \
+    inline rtype vcle(const vtype &a, const vtype &b)   \
+    {                                                   \
+        return prefix##_##postfix(a, b);                \
+    }
+
+VCLE_IMPL(uint8_t, uint8x8_t, uint8x8_t, vcle, u8)
+VCLE_IMPL(int8_t, int8x8_t, uint8x8_t, vcle, s8)
+VCLE_IMPL(uint16_t, uint16x4_t, uint16x4_t, vcle, u16)
+VCLE_IMPL(int16_t, int16x4_t, uint16x4_t, vcle, s16)
+VCLE_IMPL(uint32_t, uint32x2_t, uint32x2_t, vcle, u32)
+VCLE_IMPL(int32_t, int32x2_t, uint32x2_t, vcle, s32)
+VCLE_IMPL(float32x2_t, float32x2_t, uint32x2_t, vcle, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCLE_IMPL(float16x4_t, float16x4_t, uint16x4_t, vcle, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+VCLE_IMPL(uint8_t, uint8x16_t, uint8x16_t, vcleq, u8)
+VCLE_IMPL(int8_t, int8x16_t, uint8x16_t, vcleq, s8)
+VCLE_IMPL(uint16_t, uint16x8_t, uint16x8_t, vcleq, u16)
+VCLE_IMPL(int16_t, int16x8_t, uint16x8_t, vcleq, s16)
+VCLE_IMPL(uint32_t, uint32x4_t, uint32x4_t, vcleq, u32)
+VCLE_IMPL(int32_t, int32x4_t, uint32x4_t, vcleq, s32)
+VCLE_IMPL(float32x4_t, float32x4_t, uint32x4_t, vcleq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCLE_IMPL(float16x8_t, float16x8_t, uint16x8_t, vcleq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#undef VCLE_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_CLE_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/clt.h b/arm_compute/core/NEON/wrapper/intrinsics/clt.h
index 4701ab7..2d1ea28 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/clt.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/clt.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/combine.h b/arm_compute/core/NEON/wrapper/intrinsics/combine.h
index 9099e28..c9d5bf8 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/combine.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/combine.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/cvt.h b/arm_compute/core/NEON/wrapper/intrinsics/cvt.h
index 5ea9a5d..6e79a92 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/cvt.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/cvt.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -40,8 +40,24 @@
 
 VCVT_TO_F32_IMPL(float32x4_t, uint32x4_t, vcvtq, f32, u32)
 VCVT_TO_F32_IMPL(float32x4_t, int32x4_t, vcvtq, f32, s32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VCVT_TO_F32_IMPL(float32x4_t, float16x4_t, vcvt, f32, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 #undef VCVT_TO_F32_IMPL
 
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#define VCVT_TO_F16_IMPL(ptype, vtype, prefix, postfix1, postfix2)                       \
+    template <typename T>                                                                \
+    inline typename std::enable_if<std::is_same<T, float16_t>::value, float16x4_t>::type \
+    vcvt(const vtype &a)                                                                 \
+    {                                                                                    \
+        return prefix##_##postfix1##_##postfix2(a);                                      \
+    }
+
+VCVT_TO_F16_IMPL(float16x4_t, float32x4_t, vcvt, f16, f32)
+#undef VCVT_TO_F16_IMPL
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
 template <typename T>
 inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint32x4_t>::type
 vcvt(const float32x4_t &a)
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/div.h b/arm_compute/core/NEON/wrapper/intrinsics/div.h
index d49a911..5731aba 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/div.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/div.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/dup_n.h b/arm_compute/core/NEON/wrapper/intrinsics/dup_n.h
index ffbfde7..80d4c40 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/dup_n.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/dup_n.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/eor.h b/arm_compute/core/NEON/wrapper/intrinsics/eor.h
index a0e7b68..227a743 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/eor.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/eor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/exp.h b/arm_compute/core/NEON/wrapper/intrinsics/exp.h
index f079af0..d50824b 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/exp.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/exp.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,10 +37,18 @@
         return vexpq_##postfix(a);     \
     }
 
+#define VEXPQ_IMPL_INT(vtype, postfix)      \
+    inline vtype vexpq(const vtype &a)      \
+    {                                       \
+        ARM_COMPUTE_UNUSED(a);              \
+        ARM_COMPUTE_ERROR("Not supported"); \
+    }
+
 VEXPQ_IMPL(float32x4_t, f32)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 VEXPQ_IMPL(float16x8_t, f16)
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VEXPQ_IMPL_INT(int32x4_t, s32)
 #undef VEXPQ_IMPL
 
 } // namespace wrapper
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/ext.h b/arm_compute/core/NEON/wrapper/intrinsics/ext.h
index f2c3dcc..d44b231 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/ext.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/ext.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/gethigh.h b/arm_compute/core/NEON/wrapper/intrinsics/gethigh.h
index 13d2967..d98e129 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/gethigh.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/gethigh.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/getlane.h b/arm_compute/core/NEON/wrapper/intrinsics/getlane.h
index 533bf63..2052751 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/getlane.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/getlane.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/getlow.h b/arm_compute/core/NEON/wrapper/intrinsics/getlow.h
index dbc3d86..b85b6ca 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/getlow.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/getlow.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h
index 51b1fcc..a30e723 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,6 +31,7 @@
 #include "arm_compute/core/NEON/wrapper/intrinsics/ceq.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/cge.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/cgt.h"
+#include "arm_compute/core/NEON/wrapper/intrinsics/cle.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/clt.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/combine.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/cvt.h"
@@ -58,6 +59,7 @@
 #include "arm_compute/core/NEON/wrapper/intrinsics/pmax.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/pmin.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/pow.h"
+#include "arm_compute/core/NEON/wrapper/intrinsics/qmov.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/qmovun.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/rev64.h"
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/inv.h b/arm_compute/core/NEON/wrapper/intrinsics/inv.h
index 9da66ba..889d176 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/inv.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/inv.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h b/arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h
index 2bf9f52..8269afe 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,10 +37,18 @@
         return prefix##_##postfix(a);                \
     }
 
+#define VINVSQRT_IMPL_INT(stype, vtype, prefix, postfix) \
+    inline vtype vinvsqrt(const vtype &a)                \
+    {                                                    \
+        ARM_COMPUTE_UNUSED(a);                           \
+        ARM_COMPUTE_ERROR("Not supported");              \
+    }
+
 VINVSQRT_IMPL(float, float32x2_t, vinvsqrt, f32)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 VINVSQRT_IMPL(float16_t, float16x4_t, vinvsqrt, f16)
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VINVSQRT_IMPL_INT(int, int32x4_t, vinvsqrt, s32)
 
 VINVSQRT_IMPL(float, float32x4_t, vinvsqrtq, f32)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/load.h b/arm_compute/core/NEON/wrapper/intrinsics/load.h
index d38350f..0fdf705 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/load.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/load.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/log.h b/arm_compute/core/NEON/wrapper/intrinsics/log.h
index bb4181e..83de420 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/log.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/log.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,10 +37,19 @@
         return prefix##_##postfix(a);     \
     }
 
+#define VLOG_IMPL_INT(vtype, prefix, postfix) \
+    inline vtype vlog(const vtype &a)         \
+    {                                         \
+        ARM_COMPUTE_UNUSED(a);                \
+        ARM_COMPUTE_ERROR("Not supported");   \
+    }
+
 VLOG_IMPL(float32x4_t, vlogq, f32)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 VLOG_IMPL(float16x8_t, vlogq, f16)
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VLOG_IMPL_INT(int32x4_t, vlogq, s32)
+
 #undef VLOG_IMPL
 } // namespace wrapper
 } // namespace arm_compute
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/max.h b/arm_compute/core/NEON/wrapper/intrinsics/max.h
index a87b7a3..7e52089 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/max.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/max.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/min.h b/arm_compute/core/NEON/wrapper/intrinsics/min.h
index dc8a127..b287598 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/min.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/min.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/mla.h b/arm_compute/core/NEON/wrapper/intrinsics/mla.h
index dd2f0c0..2c89cfd 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/mla.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/mla.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/movl.h b/arm_compute/core/NEON/wrapper/intrinsics/movl.h
index 982a795..fd97a44 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/movl.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/movl.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/movn.h b/arm_compute/core/NEON/wrapper/intrinsics/movn.h
index 23360e2..ed3b159 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/movn.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/movn.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/mul.h b/arm_compute/core/NEON/wrapper/intrinsics/mul.h
index bbf70ab..88ea87a 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/mul.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/mul.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/neg.h b/arm_compute/core/NEON/wrapper/intrinsics/neg.h
index da2f285..c0c73dc 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/neg.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/neg.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/not.h b/arm_compute/core/NEON/wrapper/intrinsics/not.h
index 5b1e405..084b2a4 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/not.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/not.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/orr.h b/arm_compute/core/NEON/wrapper/intrinsics/orr.h
index 0fbdd44..13979fe 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/orr.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/orr.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/pmax.h b/arm_compute/core/NEON/wrapper/intrinsics/pmax.h
index afad27f..ba8d9cc 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/pmax.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/pmax.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/pmin.h b/arm_compute/core/NEON/wrapper/intrinsics/pmin.h
index 77c5cf6..45e64a8 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/pmin.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/pmin.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/pow.h b/arm_compute/core/NEON/wrapper/intrinsics/pow.h
index 1b5d62d..bffbc4f 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/pow.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/pow.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/qmov.h b/arm_compute/core/NEON/wrapper/intrinsics/qmov.h
new file mode 100644
index 0000000..167f3cf
--- /dev/null
+++ b/arm_compute/core/NEON/wrapper/intrinsics/qmov.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_QMOV_H
+#define ARM_COMPUTE_WRAPPER_QMOV_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+template <typename T>
+inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x8_t>::type
+vqmov(const int16x8_t &a)
+{
+    return vqmovun_s16(a);
+}
+
+template <typename T>
+inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x8_t>::type
+vqmov(const int16x8_t &a)
+{
+    return vqmovn_s16(a);
+}
+
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_QMOV_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/qmovun.h b/arm_compute/core/NEON/wrapper/intrinsics/qmovun.h
index a034702..f823ddb 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/qmovun.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/qmovun.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h b/arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h
index 579da34..0c26cd9 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/rev64.h b/arm_compute/core/NEON/wrapper/intrinsics/rev64.h
index 0385704..1119c34 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/rev64.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/rev64.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/round.h b/arm_compute/core/NEON/wrapper/intrinsics/round.h
index f3e0fe1..dd068ea 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/round.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/round.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,10 +37,18 @@
         return vroundq_rte_##postfix(a); \
     }
 
+#define VROUNDQ_IMPL_INT(vtype, postfix)    \
+    inline vtype vround(const vtype &a)     \
+    {                                       \
+        ARM_COMPUTE_UNUSED(a);              \
+        ARM_COMPUTE_ERROR("Not supported"); \
+    }
+
 VROUNDQ_IMPL(float32x4_t, f32)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 VROUNDQ_IMPL(float16x8_t, f16)
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VROUNDQ_IMPL_INT(int32x4_t, s32)
 #undef VROUNDQ_IMPL
 
 } // namespace wrapper
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/setlane.h b/arm_compute/core/NEON/wrapper/intrinsics/setlane.h
index 6332f30..197eeda 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/setlane.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/setlane.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/sin.h b/arm_compute/core/NEON/wrapper/intrinsics/sin.h
index e0fe5fb..7c9cc46 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/sin.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/sin.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,11 +37,20 @@
         return prefix##_##postfix(a);     \
     }
 
+#define VSIN_IMPL_INT(vtype, prefix, postfix) \
+    inline vtype vsin(const vtype &a)         \
+    {                                         \
+        ARM_COMPUTE_UNUSED(a);                \
+        ARM_COMPUTE_ERROR("Not supported");   \
+    }
+
 VSIN_IMPL(float32x4_t, vsinq, f32)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 VSIN_IMPL(float16x8_t, vsinq, f16)
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
+VSIN_IMPL_INT(int32x4_t, vsinq, s32)
+
 #undef vsub_IMPL
 } // namespace wrapper
 } // namespace arm_compute
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/store.h b/arm_compute/core/NEON/wrapper/intrinsics/store.h
index eb2ae6a..6dda432 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/store.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/store.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/sub.h b/arm_compute/core/NEON/wrapper/intrinsics/sub.h
index 2c6c961..475986d 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/sub.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/sub.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -64,6 +64,7 @@
 
 #undef VSUB_IMPL
 
+// VQSUB: Vector saturating sub (No notion of saturation for floating point)
 #define VQSUB_IMPL(stype, vtype, prefix, postfix)      \
     inline vtype vqsub(const vtype &a, const vtype &b) \
     {                                                  \
@@ -78,6 +79,10 @@
 VQSUB_IMPL(int32x2_t, int32x2_t, vqsub, s32)
 VQSUB_IMPL(uint64x1_t, uint64x1_t, vqsub, u64)
 VQSUB_IMPL(int64x1_t, int64x1_t, vqsub, s64)
+VQSUB_IMPL(float32x2_t, float32x2_t, vsub, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VQSUB_IMPL(float16x4_t, float16x4_t, vsub, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 VQSUB_IMPL(uint8x16_t, uint8x16_t, vqsubq, u8)
 VQSUB_IMPL(int8x16_t, int8x16_t, vqsubq, s8)
@@ -87,8 +92,12 @@
 VQSUB_IMPL(int32x4_t, int32x4_t, vqsubq, s32)
 VQSUB_IMPL(uint64x2_t, uint64x2_t, vqsubq, u64)
 VQSUB_IMPL(int64x2_t, int64x2_t, vqsubq, s64)
-
+VQSUB_IMPL(float32x4_t, float32x4_t, vsubq, f32)
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+VQSUB_IMPL(float16x8_t, float16x8_t, vsubq, f16)
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 #undef VQSUB_IMPL
+
 } // namespace wrapper
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_WRAPPER_SUB_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/tanh.h b/arm_compute/core/NEON/wrapper/intrinsics/tanh.h
index 648a001..2943b9b 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/tanh.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/tanh.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/tbl.h b/arm_compute/core/NEON/wrapper/intrinsics/tbl.h
index d3d6b72..05e6c1f 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/tbl.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/tbl.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/scalar/add.h b/arm_compute/core/NEON/wrapper/scalar/add.h
index 5a04fe2..642d926 100644
--- a/arm_compute/core/NEON/wrapper/scalar/add.h
+++ b/arm_compute/core/NEON/wrapper/scalar/add.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,6 +44,13 @@
     return vget_lane_s16(vqadd_s16(va, vb), 0);
 }
 
+inline int32_t add_sat(const int32_t &a, const int32_t &b)
+{
+    const int32x2_t va = { a, 0 };
+    const int32x2_t vb = { b, 0 };
+    return vget_lane_s32(vqadd_s32(va, vb), 0);
+}
+
 inline float add_sat(const float &a, const float &b)
 {
     // No notion of saturation exists in floating point
diff --git a/arm_compute/core/NEON/wrapper/scalar/scalar.h b/arm_compute/core/NEON/wrapper/scalar/scalar.h
index c8bd473..1bc50c2 100644
--- a/arm_compute/core/NEON/wrapper/scalar/scalar.h
+++ b/arm_compute/core/NEON/wrapper/scalar/scalar.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,5 +25,6 @@
 #define ARM_COMPUTE_WRAPPER_SCALAR_H
 
 #include "arm_compute/core/NEON/wrapper/scalar/add.h"
+#include "arm_compute/core/NEON/wrapper/scalar/sub.h"
 
 #endif /* ARM_COMPUTE_WRAPPER_SCALAR_H */
diff --git a/arm_compute/core/NEON/wrapper/scalar/sub.h b/arm_compute/core/NEON/wrapper/scalar/sub.h
new file mode 100644
index 0000000..9abda26
--- /dev/null
+++ b/arm_compute/core/NEON/wrapper/scalar/sub.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_WRAPPER_SCALAR_SUB_H
+#define ARM_COMPUTE_WRAPPER_SCALAR_SUB_H
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+inline uint8_t sub_sat(const uint8_t &a, const uint8_t &b)
+{
+    const uint8x8_t va = { a, 0, 0, 0, 0, 0, 0, 0 };
+    const uint8x8_t vb = { b, 0, 0, 0, 0, 0, 0, 0 };
+    return vget_lane_u8(vqsub_u8(va, vb), 0);
+}
+
+inline int16_t sub_sat(const int16_t &a, const int16_t &b)
+{
+    const int16x4_t va = { a, 0, 0, 0 };
+    const int16x4_t vb = { b, 0, 0, 0 };
+    return vget_lane_s16(vqsub_s16(va, vb), 0);
+}
+
+inline float sub_sat(const float &a, const float &b)
+{
+    // No notion of saturation exists in floating point
+    return a - b;
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+inline float16_t sub_sat(const float16_t &a, const float16_t &b)
+{
+    // No notion of saturation exists in floating point
+    return a - b;
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_WRAPPER_SCALAR_SUB_H */
diff --git a/arm_compute/core/NEON/wrapper/traits.h b/arm_compute/core/NEON/wrapper/traits.h
index ae77d27..eafbeef 100644
--- a/arm_compute/core/NEON/wrapper/traits.h
+++ b/arm_compute/core/NEON/wrapper/traits.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/wrapper/wrapper.h b/arm_compute/core/NEON/wrapper/wrapper.h
index 99a5909..e0c2908 100644
--- a/arm_compute/core/NEON/wrapper/wrapper.h
+++ b/arm_compute/core/NEON/wrapper/wrapper.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/PixelValue.h b/arm_compute/core/PixelValue.h
index 337ccbc..f744d64 100644
--- a/arm_compute/core/PixelValue.h
+++ b/arm_compute/core/PixelValue.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/PyramidInfo.h b/arm_compute/core/PyramidInfo.h
index e8cbe34..c6bfa1b 100644
--- a/arm_compute/core/PyramidInfo.h
+++ b/arm_compute/core/PyramidInfo.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/QuantizationInfo.h b/arm_compute/core/QuantizationInfo.h
index 52ef149..af7b8c6 100644
--- a/arm_compute/core/QuantizationInfo.h
+++ b/arm_compute/core/QuantizationInfo.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/Rounding.h b/arm_compute/core/Rounding.h
index 68d7429..b6817b5 100644
--- a/arm_compute/core/Rounding.h
+++ b/arm_compute/core/Rounding.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/Size2D.h b/arm_compute/core/Size2D.h
index 722d745..bcd89cb 100644
--- a/arm_compute/core/Size2D.h
+++ b/arm_compute/core/Size2D.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/Steps.h b/arm_compute/core/Steps.h
index 6c89185..208fc4b 100644
--- a/arm_compute/core/Steps.h
+++ b/arm_compute/core/Steps.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/Strides.h b/arm_compute/core/Strides.h
index a2a7337..265799e 100644
--- a/arm_compute/core/Strides.h
+++ b/arm_compute/core/Strides.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/SubTensorInfo.h b/arm_compute/core/SubTensorInfo.h
index bcb570a..f604f55 100644
--- a/arm_compute/core/SubTensorInfo.h
+++ b/arm_compute/core/SubTensorInfo.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/TensorInfo.h b/arm_compute/core/TensorInfo.h
index 68570d5..31f2732 100644
--- a/arm_compute/core/TensorInfo.h
+++ b/arm_compute/core/TensorInfo.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/TensorShape.h b/arm_compute/core/TensorShape.h
index 57d8f6c..2187743 100644
--- a/arm_compute/core/TensorShape.h
+++ b/arm_compute/core/TensorShape.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/TracePoint.h b/arm_compute/core/TracePoint.h
index 6951d6d..799d62e 100644
--- a/arm_compute/core/TracePoint.h
+++ b/arm_compute/core/TracePoint.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 759ff07..9750500 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -1912,15 +1912,16 @@
 struct GEMMRHSMatrixInfo
 {
     GEMMRHSMatrixInfo() = default;
-    GEMMRHSMatrixInfo(unsigned int n, unsigned int k, unsigned int h, bool trans, bool inter)
-        : n0(n), k0(k), h0(h), transpose(trans), interleave(inter)
+    GEMMRHSMatrixInfo(unsigned int n, unsigned int k, unsigned int h, bool trans, bool inter, bool export_to_cl_img)
+        : n0(n), k0(k), h0(h), transpose(trans), interleave(inter), export_to_cl_image(export_to_cl_img)
     {
     }
-    unsigned int n0{ 1 };            /**< Number of columns processed by the matrix multiplication */
-    unsigned int k0{ 1 };            /**< Number of partial accumulations performed by the matrix multiplication */
-    unsigned int h0{ 1 };            /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
-    bool         transpose{ true };  /**< True if the (k0xn0) block has to be transposed before been stored */
-    bool         interleave{ true }; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */
+    unsigned int n0{ 1 };                     /**< Number of columns processed by the matrix multiplication */
+    unsigned int k0{ 1 };                     /**< Number of partial accumulations performed by the matrix multiplication */
+    unsigned int h0{ 1 };                     /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
+    bool         transpose{ true };           /**< True if the (k0xn0) block has to be transposed before been stored */
+    bool         interleave{ true };          /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */
+    bool         export_to_cl_image{ false }; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */
 };
 
 /** GEMM information class. This class stores the necessary information to compute GEMM functions
diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
index eff6157..7a1cc99 100644
--- a/arm_compute/core/Utils.h
+++ b/arm_compute/core/Utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -981,16 +981,6 @@
  */
 QuantizationInfo get_softmax_output_quantization_info(DataType input_type, bool is_log);
 
-/** Returns resize ratio between input and output with consideration of aligned corners
- *
- * @param[in] input_size    The input size
- * @param[in] output_size   the output size
- * @param[in] align_corners True to align corners of input and output. Defaults to false.
- *
- * @return The ratio between input and output (i.e., the input size divided by the output size)
- */
-float calculate_resize_ratio(size_t input_size, size_t output_size, bool align_corners = false);
-
 /** Returns a pair of minimum and maximum values for a quantized activation
  *
  * @param[in] act_info  The information for activation
diff --git a/arm_compute/core/Validate.h b/arm_compute/core/Validate.h
index bbea5e5..68b3de5 100644
--- a/arm_compute/core/Validate.h
+++ b/arm_compute/core/Validate.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/Version.h b/arm_compute/core/Version.h
index be3f026..3a2c783 100644
--- a/arm_compute/core/Version.h
+++ b/arm_compute/core/Version.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/Window.h b/arm_compute/core/Window.h
index d6690d4..2ba5440 100644
--- a/arm_compute/core/Window.h
+++ b/arm_compute/core/Window.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/Window.inl b/arm_compute/core/Window.inl
index 70c4f80..14a432a 100644
--- a/arm_compute/core/Window.inl
+++ b/arm_compute/core/Window.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/WindowIterator.h b/arm_compute/core/WindowIterator.h
index e7d5334..0967cef 100644
--- a/arm_compute/core/WindowIterator.h
+++ b/arm_compute/core/WindowIterator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/experimental/Types.h b/arm_compute/core/experimental/Types.h
new file mode 100644
index 0000000..4dee5ff
--- /dev/null
+++ b/arm_compute/core/experimental/Types.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_EXPERIMENTAL_TYPES_H
+#define ARM_COMPUTE_EXPERIMENTAL_TYPES_H
+
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/TensorShape.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+// Forward declaration
+class ITensor;
+
+/** Memory type */
+enum TensorType : int32_t
+{
+    ACL_UNKNOWN = -1,
+    ACL_SRC     = 0,
+    ACL_SRC_0   = 0,
+    ACL_SRC_1   = 1,
+    ACL_SRC_2   = 2,
+    ACL_DST     = 30,
+    ACL_DST_0   = 30,
+    ACL_DST_1   = 31,
+    ACL_INT     = 50,
+    ACL_INT_0   = 50,
+    ACL_INT_1   = 51,
+    ACL_INT_2   = 52,
+    ACL_SRC_VEC = 256,
+};
+
+namespace experimental
+{
+struct MemoryInfo
+{
+    MemoryInfo(TensorType type, size_t size, size_t alignment)
+        : type(type), size(size), alignment(alignment)
+    {
+    }
+    TensorType type;
+    size_t     size;
+    size_t     alignment;
+};
+
+using MemoryRequirements = std::vector<MemoryInfo>;
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_EXPERIMENTAL_TYPES_H */
diff --git a/arm_compute/core/utils/helpers/bit_ops.h b/arm_compute/core/utils/helpers/bit_ops.h
index 6dbca17..eee360c 100644
--- a/arm_compute/core/utils/helpers/bit_ops.h
+++ b/arm_compute/core/utils/helpers/bit_ops.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/helpers/fft.h b/arm_compute/core/utils/helpers/fft.h
index b22bece..7d111b7 100644
--- a/arm_compute/core/utils/helpers/fft.h
+++ b/arm_compute/core/utils/helpers/fft.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/helpers/float_ops.h b/arm_compute/core/utils/helpers/float_ops.h
index fceee2e..1a08fc7 100644
--- a/arm_compute/core/utils/helpers/float_ops.h
+++ b/arm_compute/core/utils/helpers/float_ops.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/helpers/tensor_info.h b/arm_compute/core/utils/helpers/tensor_info.h
index da24e82..4432340 100644
--- a/arm_compute/core/utils/helpers/tensor_info.h
+++ b/arm_compute/core/utils/helpers/tensor_info.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/helpers/tensor_transform.h b/arm_compute/core/utils/helpers/tensor_transform.h
index 7e912a6..faa5b44 100644
--- a/arm_compute/core/utils/helpers/tensor_transform.h
+++ b/arm_compute/core/utils/helpers/tensor_transform.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/io/FileHandler.h b/arm_compute/core/utils/io/FileHandler.h
index ebc2ef0..615651d 100644
--- a/arm_compute/core/utils/io/FileHandler.h
+++ b/arm_compute/core/utils/io/FileHandler.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/logging/FilePrinter.h b/arm_compute/core/utils/logging/FilePrinter.h
index 73a5421..0e5b84f 100644
--- a/arm_compute/core/utils/logging/FilePrinter.h
+++ b/arm_compute/core/utils/logging/FilePrinter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/logging/Helpers.h b/arm_compute/core/utils/logging/Helpers.h
index 341f944..08b8eb3 100644
--- a/arm_compute/core/utils/logging/Helpers.h
+++ b/arm_compute/core/utils/logging/Helpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/logging/IPrinter.h b/arm_compute/core/utils/logging/IPrinter.h
index b6ede58..42dca58 100644
--- a/arm_compute/core/utils/logging/IPrinter.h
+++ b/arm_compute/core/utils/logging/IPrinter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/logging/LogMsgDecorators.h b/arm_compute/core/utils/logging/LogMsgDecorators.h
index 08abcb4..9c9e627 100644
--- a/arm_compute/core/utils/logging/LogMsgDecorators.h
+++ b/arm_compute/core/utils/logging/LogMsgDecorators.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/logging/Logger.h b/arm_compute/core/utils/logging/Logger.h
index 2bd467a..4fc9bb7 100644
--- a/arm_compute/core/utils/logging/Logger.h
+++ b/arm_compute/core/utils/logging/Logger.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/logging/LoggerRegistry.h b/arm_compute/core/utils/logging/LoggerRegistry.h
index c1a182c..7c9931a 100644
--- a/arm_compute/core/utils/logging/LoggerRegistry.h
+++ b/arm_compute/core/utils/logging/LoggerRegistry.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/logging/Macros.h b/arm_compute/core/utils/logging/Macros.h
index e4d9734..6a1b761 100644
--- a/arm_compute/core/utils/logging/Macros.h
+++ b/arm_compute/core/utils/logging/Macros.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/logging/Printers.h b/arm_compute/core/utils/logging/Printers.h
index e09880c..80493e7 100644
--- a/arm_compute/core/utils/logging/Printers.h
+++ b/arm_compute/core/utils/logging/Printers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/logging/StdPrinter.h b/arm_compute/core/utils/logging/StdPrinter.h
index ea41ce2..eb0e78e 100644
--- a/arm_compute/core/utils/logging/StdPrinter.h
+++ b/arm_compute/core/utils/logging/StdPrinter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/logging/Types.h b/arm_compute/core/utils/logging/Types.h
index 838adf9..f0ddae6 100644
--- a/arm_compute/core/utils/logging/Types.h
+++ b/arm_compute/core/utils/logging/Types.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/math/SafeOps.h b/arm_compute/core/utils/math/SafeOps.h
index 41bbb12..4f81cf4 100644
--- a/arm_compute/core/utils/math/SafeOps.h
+++ b/arm_compute/core/utils/math/SafeOps.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/misc/CRTP.h b/arm_compute/core/utils/misc/CRTP.h
index 037c69a..d295500 100644
--- a/arm_compute/core/utils/misc/CRTP.h
+++ b/arm_compute/core/utils/misc/CRTP.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/misc/Cast.h b/arm_compute/core/utils/misc/Cast.h
index fc6246a..57c7e49 100644
--- a/arm_compute/core/utils/misc/Cast.h
+++ b/arm_compute/core/utils/misc/Cast.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/misc/ICloneable.h b/arm_compute/core/utils/misc/ICloneable.h
index 064f408..cbb0b3c 100644
--- a/arm_compute/core/utils/misc/ICloneable.h
+++ b/arm_compute/core/utils/misc/ICloneable.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/misc/InfoHelpers.h b/arm_compute/core/utils/misc/InfoHelpers.h
index c6ee7c9..ced0d24 100644
--- a/arm_compute/core/utils/misc/InfoHelpers.h
+++ b/arm_compute/core/utils/misc/InfoHelpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -86,7 +86,7 @@
     {
         ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
 
-        const ITensorInfo *cell_to_input_weights_info = (lstm_params.has_peephole_opt()) ? lstm_params.cell_to_input_weights()->info() : nullptr;
+        ITensorInfo *cell_to_input_weights_info = (lstm_params.has_peephole_opt()) ? lstm_params.cell_to_input_weights()->info() : nullptr;
         lstm_params_info->set_cifg_params(lstm_params.input_to_input_weights()->info(), lstm_params.recurrent_to_input_weights()->info(),
                                           cell_to_input_weights_info, lstm_params.input_gate_bias()->info());
     }
@@ -100,10 +100,10 @@
             ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.input_layer_norm_weights());
         }
 
-        const ITensorInfo *forget_info = lstm_params.forget_layer_norm_weights()->info();
-        const ITensorInfo *cell_info   = lstm_params.cell_layer_norm_weights()->info();
-        const ITensorInfo *output_info = lstm_params.output_layer_norm_weights()->info();
-        const ITensorInfo *input_info  = lstm_params.has_cifg_opt() ? nullptr : lstm_params.input_layer_norm_weights()->info();
+        ITensorInfo *forget_info = lstm_params.forget_layer_norm_weights()->info();
+        ITensorInfo *cell_info   = lstm_params.cell_layer_norm_weights()->info();
+        ITensorInfo *output_info = lstm_params.output_layer_norm_weights()->info();
+        ITensorInfo *input_info  = lstm_params.has_cifg_opt() ? nullptr : lstm_params.input_layer_norm_weights()->info();
 
         lstm_params_info->set_layer_normalization_params(input_info, forget_info, cell_info, output_info);
     }
diff --git a/arm_compute/core/utils/misc/Iterable.h b/arm_compute/core/utils/misc/Iterable.h
index 829c4b4..3423208 100644
--- a/arm_compute/core/utils/misc/Iterable.h
+++ b/arm_compute/core/utils/misc/Iterable.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/misc/MMappedFile.h b/arm_compute/core/utils/misc/MMappedFile.h
index 7669c5c..b3e0994 100644
--- a/arm_compute/core/utils/misc/MMappedFile.h
+++ b/arm_compute/core/utils/misc/MMappedFile.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/misc/Macros.h b/arm_compute/core/utils/misc/Macros.h
index 6e8d765..de66b6a 100644
--- a/arm_compute/core/utils/misc/Macros.h
+++ b/arm_compute/core/utils/misc/Macros.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/misc/Random.h b/arm_compute/core/utils/misc/Random.h
index 9f5a128..6832c49 100644
--- a/arm_compute/core/utils/misc/Random.h
+++ b/arm_compute/core/utils/misc/Random.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/misc/Requires.h b/arm_compute/core/utils/misc/Requires.h
index 33c6fa3..ba91039 100644
--- a/arm_compute/core/utils/misc/Requires.h
+++ b/arm_compute/core/utils/misc/Requires.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/misc/Rounding.h b/arm_compute/core/utils/misc/Rounding.h
index 650137a..1ed4e64 100644
--- a/arm_compute/core/utils/misc/Rounding.h
+++ b/arm_compute/core/utils/misc/Rounding.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/misc/SaturateCast.h b/arm_compute/core/utils/misc/SaturateCast.h
index 0241c64..cbced83 100644
--- a/arm_compute/core/utils/misc/SaturateCast.h
+++ b/arm_compute/core/utils/misc/SaturateCast.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index dfccec8..0be4caf 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -809,6 +809,37 @@
     return output_shape;
 }
 
+/** Calculate the output unpool shape of a tensor
+ *
+ * @param[in] input     Input tensor info
+ * @param[in] pool_info Pooling layer info
+ *
+ * @return the calculated shape
+ */
+inline TensorShape compute_unpool_shape(const ITensorInfo &input, PoolingLayerInfo pool_info)
+{
+    const unsigned int idx_width   = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::WIDTH);
+    const unsigned int idx_height  = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::HEIGHT);
+    const TensorShape  input_shape = input.tensor_shape();
+    ARM_COMPUTE_ERROR_ON(input_shape[idx_height] <= 1 || input_shape[idx_width] <= 1);
+    const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
+    const unsigned int  stride_x        = pad_stride_info.stride().first;
+    const unsigned int  stride_y        = pad_stride_info.stride().second;
+
+    const int pad_left   = pad_stride_info.pad_left();
+    const int pad_top    = pad_stride_info.pad_top();
+    const int pad_right  = pad_stride_info.pad_right();
+    const int pad_bottom = pad_stride_info.pad_bottom();
+
+    TensorShape        output_shape = input_shape;
+    const unsigned int out_width    = (input_shape[idx_width] - 1) * stride_x - pad_left - pad_right + pool_info.pool_size.width;
+    const unsigned int out_height   = (input_shape[idx_height] - 1) * stride_y - pad_top - pad_bottom + pool_info.pool_size.height;
+
+    output_shape.set(idx_width, out_width);
+    output_shape.set(idx_height, out_height);
+    return output_shape;
+}
+
 /** Calculate the output roi align shape of a tensor
  *
  * @param[in] input     Input tensor info
diff --git a/arm_compute/core/utils/misc/Traits.h b/arm_compute/core/utils/misc/Traits.h
index 1cbdbfe..58fb1bf 100644
--- a/arm_compute/core/utils/misc/Traits.h
+++ b/arm_compute/core/utils/misc/Traits.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/misc/Utility.h b/arm_compute/core/utils/misc/Utility.h
index b2bb63f..646d665 100644
--- a/arm_compute/core/utils/misc/Utility.h
+++ b/arm_compute/core/utils/misc/Utility.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/utils/quantization/AsymmHelpers.h b/arm_compute/core/utils/quantization/AsymmHelpers.h
index 4ef4947..cbf7559 100644
--- a/arm_compute/core/utils/quantization/AsymmHelpers.h
+++ b/arm_compute/core/utils/quantization/AsymmHelpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph.h b/arm_compute/graph.h
index 37c5220..6bb6f14 100644
--- a/arm_compute/graph.h
+++ b/arm_compute/graph.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/Edge.h b/arm_compute/graph/Edge.h
index e40914e..5e81b9c 100644
--- a/arm_compute/graph/Edge.h
+++ b/arm_compute/graph/Edge.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/Graph.h b/arm_compute/graph/Graph.h
index dce92c6..0cdd8f8 100644
--- a/arm_compute/graph/Graph.h
+++ b/arm_compute/graph/Graph.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/GraphBuilder.h b/arm_compute/graph/GraphBuilder.h
index 612703c..bce1ce4 100644
--- a/arm_compute/graph/GraphBuilder.h
+++ b/arm_compute/graph/GraphBuilder.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/GraphContext.h b/arm_compute/graph/GraphContext.h
index 973264a..7beb598 100644
--- a/arm_compute/graph/GraphContext.h
+++ b/arm_compute/graph/GraphContext.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/GraphManager.h b/arm_compute/graph/GraphManager.h
index 23fa732..ae48e81 100644
--- a/arm_compute/graph/GraphManager.h
+++ b/arm_compute/graph/GraphManager.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/IDeviceBackend.h b/arm_compute/graph/IDeviceBackend.h
index d40d752..3a0a7e6 100644
--- a/arm_compute/graph/IDeviceBackend.h
+++ b/arm_compute/graph/IDeviceBackend.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/IGraphMutator.h b/arm_compute/graph/IGraphMutator.h
index 94a28f6..1c68b17 100644
--- a/arm_compute/graph/IGraphMutator.h
+++ b/arm_compute/graph/IGraphMutator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/IGraphPrinter.h b/arm_compute/graph/IGraphPrinter.h
index 73f3fa8..9ee11d5 100644
--- a/arm_compute/graph/IGraphPrinter.h
+++ b/arm_compute/graph/IGraphPrinter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/INode.h b/arm_compute/graph/INode.h
index 5536bb9..b920034 100644
--- a/arm_compute/graph/INode.h
+++ b/arm_compute/graph/INode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/INodeVisitor.h b/arm_compute/graph/INodeVisitor.h
index deaa8f1..338de7a 100644
--- a/arm_compute/graph/INodeVisitor.h
+++ b/arm_compute/graph/INodeVisitor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/ITensorAccessor.h b/arm_compute/graph/ITensorAccessor.h
index f7add37..96bd499 100644
--- a/arm_compute/graph/ITensorAccessor.h
+++ b/arm_compute/graph/ITensorAccessor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/ITensorHandle.h b/arm_compute/graph/ITensorHandle.h
index 1d78e78..0908d64 100644
--- a/arm_compute/graph/ITensorHandle.h
+++ b/arm_compute/graph/ITensorHandle.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/LayerDescriptors.h b/arm_compute/graph/LayerDescriptors.h
index d8e6a6a..c11174f 100644
--- a/arm_compute/graph/LayerDescriptors.h
+++ b/arm_compute/graph/LayerDescriptors.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -89,6 +89,31 @@
     ActivationLayerInfo fused_activation; /**< Fused activation info */
 };
 
+/** Unary Elementwise layer descriptor */
+struct UnaryEltwiseLayerDescriptor
+{
+    /** Constructor
+     *
+     * @param[in] op               Unary element-wise operation to perform
+     * @param[in] out_quant_info   (Optional) Output quantization information. Defaults to empty @ref QuantizationInfo
+     * @param[in] c_policy         (Optional) Convert policy used for the operation. Defaults to @ref ConvertPolicy::SATURATE
+     * @param[in] r_policy         (Optional) Rounding policy used for the operation. Defaults to @ref RoundingPolicy::TO_ZERO
+     * @param[in] fused_activation (Optional) Fused activation information. Defaults to empty (identity) @ref ActivationLayerInfo
+     */
+    UnaryEltwiseLayerDescriptor(UnaryEltwiseOperation op, QuantizationInfo out_quant_info = QuantizationInfo(), ConvertPolicy c_policy = ConvertPolicy::SATURATE,
+                                RoundingPolicy      r_policy         = RoundingPolicy::TO_ZERO,
+                                ActivationLayerInfo fused_activation = ActivationLayerInfo())
+        : op(op), out_quant_info(out_quant_info), c_policy(c_policy), r_policy(r_policy), fused_activation(fused_activation)
+    {
+    }
+
+    UnaryEltwiseOperation op;               /**< Unary element-wise operation to perform */
+    QuantizationInfo      out_quant_info;   /**< Output quantization information */
+    ConvertPolicy         c_policy;         /**< Convert policy */
+    RoundingPolicy        r_policy;         /**< Rounding policy */
+    ActivationLayerInfo   fused_activation; /**< Fused activation info */
+};
+
 /** Deconvolution layer descriptor */
 struct DeconvolutionLayerDescriptor
 {
diff --git a/arm_compute/graph/Logger.h b/arm_compute/graph/Logger.h
index 8aa87f0..872c650 100644
--- a/arm_compute/graph/Logger.h
+++ b/arm_compute/graph/Logger.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/PassManager.h b/arm_compute/graph/PassManager.h
index c8920ba..efdf6ab 100644
--- a/arm_compute/graph/PassManager.h
+++ b/arm_compute/graph/PassManager.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/Tensor.h b/arm_compute/graph/Tensor.h
index 42d5d10..de96c99 100644
--- a/arm_compute/graph/Tensor.h
+++ b/arm_compute/graph/Tensor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/TensorDescriptor.h b/arm_compute/graph/TensorDescriptor.h
index 0cd892b..6c6f99d 100644
--- a/arm_compute/graph/TensorDescriptor.h
+++ b/arm_compute/graph/TensorDescriptor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/TypeLoader.h b/arm_compute/graph/TypeLoader.h
index e1b920c..a53af40 100644
--- a/arm_compute/graph/TypeLoader.h
+++ b/arm_compute/graph/TypeLoader.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/TypePrinter.h b/arm_compute/graph/TypePrinter.h
index d56407b..5d6c9f3 100644
--- a/arm_compute/graph/TypePrinter.h
+++ b/arm_compute/graph/TypePrinter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -98,6 +98,9 @@
         case NodeType::EltwiseLayer:
             os << "EltwiseLayer";
             break;
+        case NodeType::UnaryEltwiseLayer:
+            os << "UnaryEltwiseLayer";
+            break;
         case NodeType::FlattenLayer:
             os << "FlattenLayer";
             break;
diff --git a/arm_compute/graph/Types.h b/arm_compute/graph/Types.h
index 296f757..3a4d0a6 100644
--- a/arm_compute/graph/Types.h
+++ b/arm_compute/graph/Types.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -79,7 +79,7 @@
 /** Graph configuration structure */
 struct GraphConfig
 {
-    bool        use_function_memory_manager{ true };   /**< Use a memory manager to manage per-funcion auxilary memory */
+    bool        use_function_memory_manager{ true };   /**< Use a memory manager to manage per-function auxiliary memory */
     bool        use_function_weights_manager{ true };  /**< Use a weights manager to manage transformed weights */
     bool        use_transition_memory_manager{ true }; /**< Use a memory manager to manager transition buffer memory */
     bool        use_tuner{ false };                    /**< Use a tuner in tunable backends */
@@ -103,7 +103,13 @@
 {
     Add, /**< Arithmetic addition */
     Sub, /**< Arithmetic subtraction */
-    Mul  /**< Arithmetic multiplication */
+    Mul, /**< Arithmetic multiplication */
+};
+
+/** Supported Unary Element-wise operations */
+enum class UnaryEltwiseOperation
+{
+    Exp /**< Exp */
 };
 
 /** Supported Convolution layer methods */
@@ -168,6 +174,7 @@
     SplitLayer,
     StackLayer,
     UpsampleLayer,
+    UnaryEltwiseLayer,
     YOLOLayer,
 
     Input,
diff --git a/arm_compute/graph/Utils.h b/arm_compute/graph/Utils.h
index cb421fc..b1be51e 100644
--- a/arm_compute/graph/Utils.h
+++ b/arm_compute/graph/Utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/Workload.h b/arm_compute/graph/Workload.h
index a36c28a..5b4533c 100644
--- a/arm_compute/graph/Workload.h
+++ b/arm_compute/graph/Workload.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/algorithms/Algorithms.h b/arm_compute/graph/algorithms/Algorithms.h
index 89441c8..2088d38 100644
--- a/arm_compute/graph/algorithms/Algorithms.h
+++ b/arm_compute/graph/algorithms/Algorithms.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/algorithms/TopologicalSort.h b/arm_compute/graph/algorithms/TopologicalSort.h
index 486c0b5..25476e2 100644
--- a/arm_compute/graph/algorithms/TopologicalSort.h
+++ b/arm_compute/graph/algorithms/TopologicalSort.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/backends/BackendRegistrar.h b/arm_compute/graph/backends/BackendRegistrar.h
index 5d1582d..902c12b 100644
--- a/arm_compute/graph/backends/BackendRegistrar.h
+++ b/arm_compute/graph/backends/BackendRegistrar.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/backends/BackendRegistry.h b/arm_compute/graph/backends/BackendRegistry.h
index c9e84bd..c4414a2 100644
--- a/arm_compute/graph/backends/BackendRegistry.h
+++ b/arm_compute/graph/backends/BackendRegistry.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/backends/CL/CLDeviceBackend.h b/arm_compute/graph/backends/CL/CLDeviceBackend.h
index 492dca0..a8ee25d 100644
--- a/arm_compute/graph/backends/CL/CLDeviceBackend.h
+++ b/arm_compute/graph/backends/CL/CLDeviceBackend.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/backends/CL/CLFunctionFactory.h b/arm_compute/graph/backends/CL/CLFunctionFactory.h
index 264612c..e832f45 100644
--- a/arm_compute/graph/backends/CL/CLFunctionFactory.h
+++ b/arm_compute/graph/backends/CL/CLFunctionFactory.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/backends/CL/CLNodeValidator.h b/arm_compute/graph/backends/CL/CLNodeValidator.h
index 2f43cd4..6e102a3 100644
--- a/arm_compute/graph/backends/CL/CLNodeValidator.h
+++ b/arm_compute/graph/backends/CL/CLNodeValidator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/backends/CL/CLSubTensorHandle.h b/arm_compute/graph/backends/CL/CLSubTensorHandle.h
index 3379feb..3750fc8 100644
--- a/arm_compute/graph/backends/CL/CLSubTensorHandle.h
+++ b/arm_compute/graph/backends/CL/CLSubTensorHandle.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/backends/CL/CLTensorHandle.h b/arm_compute/graph/backends/CL/CLTensorHandle.h
index 1452ef8..16e30ef 100644
--- a/arm_compute/graph/backends/CL/CLTensorHandle.h
+++ b/arm_compute/graph/backends/CL/CLTensorHandle.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/backends/FunctionHelpers.h b/arm_compute/graph/backends/FunctionHelpers.h
index 382b18a..af74834 100644
--- a/arm_compute/graph/backends/FunctionHelpers.h
+++ b/arm_compute/graph/backends/FunctionHelpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -383,7 +383,7 @@
     }
 
     // Extract IO and info
-    std::vector<typename TargetInfo::TensorType *> inputs;
+    std::vector<typename TargetInfo::SrcTensorType *> inputs;
     for(unsigned int i = 0; i < node.num_inputs(); ++i)
     {
         inputs.push_back(get_backing_tensor<TargetInfo>(node.input(i)));
@@ -816,6 +816,54 @@
     return RETURN_UNIQUE_PTR(func);
 }
 
+/** Create a backend unary element-wise operation layer function
+ *
+ * @tparam UnaryEltwiseFunctions Backend unary element-wise function
+ * @tparam TargetInfo       Target-specific information
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend unary element-wise operation layer function
+ */
+template <typename UnaryEltwiseFunctions, typename TargetInfo>
+std::unique_ptr<IFunction> create_unary_eltwise_layer(UnaryEltwiseLayerNode &node)
+{
+    validate_node<TargetInfo>(node, 1 /* expected inputs */, 1 /* expected outputs */);
+
+    // Extract IO and info
+    typename TargetInfo::TensorType *input      = get_backing_tensor<TargetInfo>(node.input(0));
+    typename TargetInfo::TensorType *output     = get_backing_tensor<TargetInfo>(node.output(0));
+    const UnaryEltwiseOperation      eltwise_op = node.eltwise_descriptor().op;
+
+    ARM_COMPUTE_ERROR_ON(input == nullptr);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    std::unique_ptr<IFunction> func = nullptr;
+    std::string                func_name;
+    if(eltwise_op == UnaryEltwiseOperation::Exp)
+    {
+        std::tie(func, func_name) = create_named_function<typename UnaryEltwiseFunctions::Exp>(
+                                        std::string("Exp"),
+                                        input, output);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Unsupported unary element-wise operation!");
+    }
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
+                               << node.name()
+                               << " Type: " << node.type()
+                               << " Target: " << TargetInfo::TargetType
+                               << " Operation: " << func_name
+                               << " Data Type: " << input->info()->data_type()
+                               << " Shape: " << input->info()->tensor_shape()
+                               << std::endl);
+
+    return RETURN_UNIQUE_PTR(func);
+}
+
 /** Create a backend flatten layer function
  *
  * @tparam FlattenLayerFunction Backend flatten function
@@ -1401,7 +1449,7 @@
 
     // Create and configure function
     auto func = support::cpp14::make_unique<ResizeLayerFunction>();
-    func->configure(input, output, policy, BorderMode::CONSTANT);
+    func->configure(input, output, ScaleKernelInfo{ policy, BorderMode::CONSTANT });
 
     // Log info
     ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
diff --git a/arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h b/arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h
index 895906d..ec03bcc 100644
--- a/arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h
+++ b/arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h b/arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h
index 37f3eab..4f8a8da 100644
--- a/arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h
+++ b/arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/backends/GLES/GCDeviceBackend.h b/arm_compute/graph/backends/GLES/GCDeviceBackend.h
index a6c0bfe..41805cd 100644
--- a/arm_compute/graph/backends/GLES/GCDeviceBackend.h
+++ b/arm_compute/graph/backends/GLES/GCDeviceBackend.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/backends/GLES/GCFunctionFactory.h b/arm_compute/graph/backends/GLES/GCFunctionFactory.h
index 289a3cb..e5c00e5 100644
--- a/arm_compute/graph/backends/GLES/GCFunctionFactory.h
+++ b/arm_compute/graph/backends/GLES/GCFunctionFactory.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/backends/GLES/GCNodeValidator.h b/arm_compute/graph/backends/GLES/GCNodeValidator.h
index 89421f7..ab3864a 100644
--- a/arm_compute/graph/backends/GLES/GCNodeValidator.h
+++ b/arm_compute/graph/backends/GLES/GCNodeValidator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/backends/GLES/GCTensorHandle.h b/arm_compute/graph/backends/GLES/GCTensorHandle.h
index 119731d..bf4897a 100644
--- a/arm_compute/graph/backends/GLES/GCTensorHandle.h
+++ b/arm_compute/graph/backends/GLES/GCTensorHandle.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/backends/NEON/NEDeviceBackend.h b/arm_compute/graph/backends/NEON/NEDeviceBackend.h
index 87acc55..d0c8c27 100644
--- a/arm_compute/graph/backends/NEON/NEDeviceBackend.h
+++ b/arm_compute/graph/backends/NEON/NEDeviceBackend.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/backends/NEON/NEFunctionFactory.h b/arm_compute/graph/backends/NEON/NEFunctionFactory.h
index b05b9f0..5d0e175 100644
--- a/arm_compute/graph/backends/NEON/NEFunctionFactory.h
+++ b/arm_compute/graph/backends/NEON/NEFunctionFactory.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/backends/NEON/NENodeValidator.h b/arm_compute/graph/backends/NEON/NENodeValidator.h
index 578f405..5d23b8b 100644
--- a/arm_compute/graph/backends/NEON/NENodeValidator.h
+++ b/arm_compute/graph/backends/NEON/NENodeValidator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/backends/NEON/NESubTensorHandle.h b/arm_compute/graph/backends/NEON/NESubTensorHandle.h
index 1dbc053..259be78 100644
--- a/arm_compute/graph/backends/NEON/NESubTensorHandle.h
+++ b/arm_compute/graph/backends/NEON/NESubTensorHandle.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/backends/NEON/NETensorHandle.h b/arm_compute/graph/backends/NEON/NETensorHandle.h
index 0f1b748..86aba30 100644
--- a/arm_compute/graph/backends/NEON/NETensorHandle.h
+++ b/arm_compute/graph/backends/NEON/NETensorHandle.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/backends/Utils.h b/arm_compute/graph/backends/Utils.h
index 4893340..0322ec5 100644
--- a/arm_compute/graph/backends/Utils.h
+++ b/arm_compute/graph/backends/Utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/backends/ValidateHelpers.h b/arm_compute/graph/backends/ValidateHelpers.h
index 673caf9..c9299838 100644
--- a/arm_compute/graph/backends/ValidateHelpers.h
+++ b/arm_compute/graph/backends/ValidateHelpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -579,6 +579,79 @@
     // Validate function
     return YOLOLayer::validate(input, output, node.activation_info(), node.num_classes());
 }
+/** Validates a element-wise layer node
+ *
+ * @param[in] node Node to validate
+ *
+ * @return Status
+ */
+template <typename EltwiseLayerFunctions>
+Status validate_eltwise_Layer(EltwiseLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating EltwiseLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract input and output
+    const arm_compute::ITensorInfo *input1         = detail::get_backing_tensor_info(node.input(0));
+    const arm_compute::ITensorInfo *input2         = detail::get_backing_tensor_info(node.input(1));
+    const arm_compute::ITensorInfo *output         = get_backing_tensor_info(node.output(0));
+    const EltwiseOperation          eltwise_op     = node.eltwise_operation();
+    const ConvertPolicy             convert_policy = node.convert_policy();
+    const RoundingPolicy            round_policy   = node.rounding_policy();
+    const ActivationLayerInfo       act_info       = node.fused_activation();
+    const QuantizationInfo          quant_info     = node.output_quant_info();
+    const float                     scale          = (quant_info.scale().empty()) ? 1.0f : quant_info.scale()[0];
+
+    // Validate function
+    if(eltwise_op == EltwiseOperation::Add)
+    {
+        return EltwiseLayerFunctions::ArithmeticAddition::validate(input1, input2, output, convert_policy, act_info);
+    }
+    else if(eltwise_op == EltwiseOperation::Sub)
+    {
+        return EltwiseLayerFunctions::ArithmeticSubtraction::validate(input1, input2, output, convert_policy, act_info);
+    }
+    else if(eltwise_op == EltwiseOperation::Mul)
+    {
+        return EltwiseLayerFunctions::PixelWiseMultiplication::validate(input1, input2, output, scale, convert_policy, round_policy, act_info);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Unsupported element-wise operation!");
+    }
+    return Status{};
+}
+/** Validates a unary element-wise layer node
+ *
+ * @param[in] node Node to validate
+ *
+ * @return Status
+ */
+template <typename UnaryEltwiseLayerFunctions>
+Status validate_unary_eltwise_layer(UnaryEltwiseLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating UnaryEltwiseLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract input and output
+    arm_compute::ITensorInfo   *input      = detail::get_backing_tensor_info(node.input(0));
+    arm_compute::ITensorInfo   *output     = get_backing_tensor_info(node.output(0));
+    const UnaryEltwiseOperation eltwise_op = node.eltwise_descriptor().op;
+
+    // Validate function
+    if(eltwise_op == UnaryEltwiseOperation::Exp)
+    {
+        return UnaryEltwiseLayerFunctions::ExpLayer::validate(input, output);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Unsupported unary element-wise operation!");
+    }
+
+    return Status{};
+}
 } // namespace detail
 } // namespace backends
 } // namespace graph
diff --git a/arm_compute/graph/detail/CrossLayerMemoryManagerHelpers.h b/arm_compute/graph/detail/CrossLayerMemoryManagerHelpers.h
index 30c084e..1f43ac4 100644
--- a/arm_compute/graph/detail/CrossLayerMemoryManagerHelpers.h
+++ b/arm_compute/graph/detail/CrossLayerMemoryManagerHelpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/detail/ExecutionHelpers.h b/arm_compute/graph/detail/ExecutionHelpers.h
index aa1af27..b1662bb 100644
--- a/arm_compute/graph/detail/ExecutionHelpers.h
+++ b/arm_compute/graph/detail/ExecutionHelpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/frontend/ILayer.h b/arm_compute/graph/frontend/ILayer.h
index 30652a7..7eb405b 100644
--- a/arm_compute/graph/frontend/ILayer.h
+++ b/arm_compute/graph/frontend/ILayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/frontend/IStream.h b/arm_compute/graph/frontend/IStream.h
index e155bbc..f69d543 100644
--- a/arm_compute/graph/frontend/IStream.h
+++ b/arm_compute/graph/frontend/IStream.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/frontend/IStreamOperators.h b/arm_compute/graph/frontend/IStreamOperators.h
index 8e2ca41..deaf66d 100644
--- a/arm_compute/graph/frontend/IStreamOperators.h
+++ b/arm_compute/graph/frontend/IStreamOperators.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/frontend/Layers.h b/arm_compute/graph/frontend/Layers.h
index 61a6fd4..6aeebb4 100644
--- a/arm_compute/graph/frontend/Layers.h
+++ b/arm_compute/graph/frontend/Layers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/frontend/Stream.h b/arm_compute/graph/frontend/Stream.h
index b52274e..db22f6d 100644
--- a/arm_compute/graph/frontend/Stream.h
+++ b/arm_compute/graph/frontend/Stream.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/frontend/SubStream.h b/arm_compute/graph/frontend/SubStream.h
index 3df7379..2283cfe 100644
--- a/arm_compute/graph/frontend/SubStream.h
+++ b/arm_compute/graph/frontend/SubStream.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/frontend/Types.h b/arm_compute/graph/frontend/Types.h
index 741412d..bc4fe7a 100644
--- a/arm_compute/graph/frontend/Types.h
+++ b/arm_compute/graph/frontend/Types.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/mutators/DepthConcatSubTensorMutator.h b/arm_compute/graph/mutators/DepthConcatSubTensorMutator.h
index 14a427b..cb1f079 100644
--- a/arm_compute/graph/mutators/DepthConcatSubTensorMutator.h
+++ b/arm_compute/graph/mutators/DepthConcatSubTensorMutator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/mutators/GraphMutators.h b/arm_compute/graph/mutators/GraphMutators.h
index 6ae0699..155b332 100644
--- a/arm_compute/graph/mutators/GraphMutators.h
+++ b/arm_compute/graph/mutators/GraphMutators.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/mutators/GroupedConvolutionMutator.h b/arm_compute/graph/mutators/GroupedConvolutionMutator.h
index 01c9d0e..e68c703 100644
--- a/arm_compute/graph/mutators/GroupedConvolutionMutator.h
+++ b/arm_compute/graph/mutators/GroupedConvolutionMutator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/mutators/InPlaceOperationMutator.h b/arm_compute/graph/mutators/InPlaceOperationMutator.h
index 7932b62..6248d86 100644
--- a/arm_compute/graph/mutators/InPlaceOperationMutator.h
+++ b/arm_compute/graph/mutators/InPlaceOperationMutator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/mutators/NodeExecutionMethodMutator.h b/arm_compute/graph/mutators/NodeExecutionMethodMutator.h
index 3de9406..07c8ffa 100644
--- a/arm_compute/graph/mutators/NodeExecutionMethodMutator.h
+++ b/arm_compute/graph/mutators/NodeExecutionMethodMutator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/mutators/NodeFusionMutator.h b/arm_compute/graph/mutators/NodeFusionMutator.h
index b99ee79..f3e3eaa 100644
--- a/arm_compute/graph/mutators/NodeFusionMutator.h
+++ b/arm_compute/graph/mutators/NodeFusionMutator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/mutators/SplitLayerSubTensorMutator.h b/arm_compute/graph/mutators/SplitLayerSubTensorMutator.h
index c9747fd..b14ef59 100644
--- a/arm_compute/graph/mutators/SplitLayerSubTensorMutator.h
+++ b/arm_compute/graph/mutators/SplitLayerSubTensorMutator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/mutators/SyntheticDataTypeMutator.h b/arm_compute/graph/mutators/SyntheticDataTypeMutator.h
index 74f4b56..ed270f8 100644
--- a/arm_compute/graph/mutators/SyntheticDataTypeMutator.h
+++ b/arm_compute/graph/mutators/SyntheticDataTypeMutator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/ActivationLayerNode.h b/arm_compute/graph/nodes/ActivationLayerNode.h
index 975bc8e..4a98ee2 100644
--- a/arm_compute/graph/nodes/ActivationLayerNode.h
+++ b/arm_compute/graph/nodes/ActivationLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/BatchNormalizationLayerNode.h b/arm_compute/graph/nodes/BatchNormalizationLayerNode.h
index b50b955..e7f4049 100644
--- a/arm_compute/graph/nodes/BatchNormalizationLayerNode.h
+++ b/arm_compute/graph/nodes/BatchNormalizationLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/BoundingBoxTransformLayerNode.h b/arm_compute/graph/nodes/BoundingBoxTransformLayerNode.h
index 062054e..57175eb 100644
--- a/arm_compute/graph/nodes/BoundingBoxTransformLayerNode.h
+++ b/arm_compute/graph/nodes/BoundingBoxTransformLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/ChannelShuffleLayerNode.h b/arm_compute/graph/nodes/ChannelShuffleLayerNode.h
index 830efb9..0696fe5 100644
--- a/arm_compute/graph/nodes/ChannelShuffleLayerNode.h
+++ b/arm_compute/graph/nodes/ChannelShuffleLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/ConcatenateLayerNode.h b/arm_compute/graph/nodes/ConcatenateLayerNode.h
index 77ca6f6..8582403 100644
--- a/arm_compute/graph/nodes/ConcatenateLayerNode.h
+++ b/arm_compute/graph/nodes/ConcatenateLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/ConstNode.h b/arm_compute/graph/nodes/ConstNode.h
index 24dfaaa..b377c60 100644
--- a/arm_compute/graph/nodes/ConstNode.h
+++ b/arm_compute/graph/nodes/ConstNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/ConvolutionLayerNode.h b/arm_compute/graph/nodes/ConvolutionLayerNode.h
index eea43d7..e4151c0 100644
--- a/arm_compute/graph/nodes/ConvolutionLayerNode.h
+++ b/arm_compute/graph/nodes/ConvolutionLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/DeconvolutionLayerNode.h b/arm_compute/graph/nodes/DeconvolutionLayerNode.h
index a5efdfb..e74adb1 100644
--- a/arm_compute/graph/nodes/DeconvolutionLayerNode.h
+++ b/arm_compute/graph/nodes/DeconvolutionLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h b/arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h
index d8d36d9..59847a9 100644
--- a/arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h
+++ b/arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/DequantizationLayerNode.h b/arm_compute/graph/nodes/DequantizationLayerNode.h
index 8b3d4ad..4910938 100644
--- a/arm_compute/graph/nodes/DequantizationLayerNode.h
+++ b/arm_compute/graph/nodes/DequantizationLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/DetectionOutputLayerNode.h b/arm_compute/graph/nodes/DetectionOutputLayerNode.h
index 7732abf..b4b910c 100644
--- a/arm_compute/graph/nodes/DetectionOutputLayerNode.h
+++ b/arm_compute/graph/nodes/DetectionOutputLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/DetectionPostProcessLayerNode.h b/arm_compute/graph/nodes/DetectionPostProcessLayerNode.h
index 97d881d..6ff78ae 100644
--- a/arm_compute/graph/nodes/DetectionPostProcessLayerNode.h
+++ b/arm_compute/graph/nodes/DetectionPostProcessLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/DummyNode.h b/arm_compute/graph/nodes/DummyNode.h
index e257641..645f1b3 100644
--- a/arm_compute/graph/nodes/DummyNode.h
+++ b/arm_compute/graph/nodes/DummyNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/EltwiseLayerNode.h b/arm_compute/graph/nodes/EltwiseLayerNode.h
index d619ad2..7a6d8e8 100644
--- a/arm_compute/graph/nodes/EltwiseLayerNode.h
+++ b/arm_compute/graph/nodes/EltwiseLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,6 +63,12 @@
      */
     ActivationLayerInfo fused_activation() const;
 
+    /** Returns output quantization info
+     *
+     * @return Output quantization info
+     */
+    QuantizationInfo output_quant_info() const;
+
     /** Sets fused activation
      *
      * @param[in] fused_activation Fused activation to set
@@ -80,6 +86,40 @@
 private:
     descriptors::EltwiseLayerDescriptor descriptor;
 };
+
+/** Unary Eltwise Layer node */
+class UnaryEltwiseLayerNode final : public INode
+{
+public:
+    /** Constructor
+     *
+     * @param[in] descriptor Descriptor containing information for the node described in @ref descriptors::UnaryEltwiseLayerDescriptor
+     */
+    UnaryEltwiseLayerNode(const descriptors::UnaryEltwiseLayerDescriptor &descriptor);
+    /** Unary eltwise layer descriptor
+     *
+     * @return Unary eltwise layer descriptor containing the layer information
+     */
+    descriptors::UnaryEltwiseLayerDescriptor eltwise_descriptor() const;
+
+    /** Sets fused activation
+     *
+     * @param[in] fused_activation Fused activation to set
+     */
+    void set_fused_activation(ActivationLayerInfo fused_activation);
+
+    // Inherited overridden methods:
+    NodeType         type() const override;
+    bool             forward_descriptors() override;
+    TensorDescriptor configure_output(size_t idx) const override;
+    void accept(INodeVisitor &v) override;
+
+    static constexpr NodeType node_type = NodeType::UnaryEltwiseLayer;
+
+private:
+    descriptors::UnaryEltwiseLayerDescriptor descriptor;
+};
+
 } // namespace graph
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_GRAPH_ELTWISE_LAYER_NODE_H */
diff --git a/arm_compute/graph/nodes/FlattenLayerNode.h b/arm_compute/graph/nodes/FlattenLayerNode.h
index fd9a525..046114c 100644
--- a/arm_compute/graph/nodes/FlattenLayerNode.h
+++ b/arm_compute/graph/nodes/FlattenLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/FullyConnectedLayerNode.h b/arm_compute/graph/nodes/FullyConnectedLayerNode.h
index 10c310d..a7712f4 100644
--- a/arm_compute/graph/nodes/FullyConnectedLayerNode.h
+++ b/arm_compute/graph/nodes/FullyConnectedLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h b/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h
index 62ec55e..b3661c3 100644
--- a/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h
+++ b/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h b/arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h
index 668f09e..a01cb9d 100644
--- a/arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h
+++ b/arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/GenerateProposalsLayerNode.h b/arm_compute/graph/nodes/GenerateProposalsLayerNode.h
index 57cadf4..6f8edc8 100644
--- a/arm_compute/graph/nodes/GenerateProposalsLayerNode.h
+++ b/arm_compute/graph/nodes/GenerateProposalsLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/InputNode.h b/arm_compute/graph/nodes/InputNode.h
index 20fc276..07091af 100644
--- a/arm_compute/graph/nodes/InputNode.h
+++ b/arm_compute/graph/nodes/InputNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/Nodes.h b/arm_compute/graph/nodes/Nodes.h
index e8852d2..bf4ab87 100644
--- a/arm_compute/graph/nodes/Nodes.h
+++ b/arm_compute/graph/nodes/Nodes.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/NodesFwd.h b/arm_compute/graph/nodes/NodesFwd.h
index 0ccf243..9541f4b 100644
--- a/arm_compute/graph/nodes/NodesFwd.h
+++ b/arm_compute/graph/nodes/NodesFwd.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/NormalizationLayerNode.h b/arm_compute/graph/nodes/NormalizationLayerNode.h
index 3f5e781..503b859 100644
--- a/arm_compute/graph/nodes/NormalizationLayerNode.h
+++ b/arm_compute/graph/nodes/NormalizationLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/NormalizePlanarYUVLayerNode.h b/arm_compute/graph/nodes/NormalizePlanarYUVLayerNode.h
index 843ab96..4d84c20 100644
--- a/arm_compute/graph/nodes/NormalizePlanarYUVLayerNode.h
+++ b/arm_compute/graph/nodes/NormalizePlanarYUVLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/OutputNode.h b/arm_compute/graph/nodes/OutputNode.h
index 27902af..c91bc6b 100644
--- a/arm_compute/graph/nodes/OutputNode.h
+++ b/arm_compute/graph/nodes/OutputNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/PReluLayerNode.h b/arm_compute/graph/nodes/PReluLayerNode.h
index f2055c2..b8e6c1a 100644
--- a/arm_compute/graph/nodes/PReluLayerNode.h
+++ b/arm_compute/graph/nodes/PReluLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/PadLayerNode.h b/arm_compute/graph/nodes/PadLayerNode.h
index 852427a..8fcbc52 100644
--- a/arm_compute/graph/nodes/PadLayerNode.h
+++ b/arm_compute/graph/nodes/PadLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/PermuteLayerNode.h b/arm_compute/graph/nodes/PermuteLayerNode.h
index 555f2b5..0b2380b 100644
--- a/arm_compute/graph/nodes/PermuteLayerNode.h
+++ b/arm_compute/graph/nodes/PermuteLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/PoolingLayerNode.h b/arm_compute/graph/nodes/PoolingLayerNode.h
index 41342a8..b336bb9 100644
--- a/arm_compute/graph/nodes/PoolingLayerNode.h
+++ b/arm_compute/graph/nodes/PoolingLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/PrintLayerNode.h b/arm_compute/graph/nodes/PrintLayerNode.h
index 78b7bf2..b57ac1f 100644
--- a/arm_compute/graph/nodes/PrintLayerNode.h
+++ b/arm_compute/graph/nodes/PrintLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/PriorBoxLayerNode.h b/arm_compute/graph/nodes/PriorBoxLayerNode.h
index f6cfa47..c7eadd1 100644
--- a/arm_compute/graph/nodes/PriorBoxLayerNode.h
+++ b/arm_compute/graph/nodes/PriorBoxLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/QuantizationLayerNode.h b/arm_compute/graph/nodes/QuantizationLayerNode.h
index f283c26..94c718b 100644
--- a/arm_compute/graph/nodes/QuantizationLayerNode.h
+++ b/arm_compute/graph/nodes/QuantizationLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/ROIAlignLayerNode.h b/arm_compute/graph/nodes/ROIAlignLayerNode.h
index 4452468..5abd065 100644
--- a/arm_compute/graph/nodes/ROIAlignLayerNode.h
+++ b/arm_compute/graph/nodes/ROIAlignLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/ReorgLayerNode.h b/arm_compute/graph/nodes/ReorgLayerNode.h
index 86f6252..986692e 100644
--- a/arm_compute/graph/nodes/ReorgLayerNode.h
+++ b/arm_compute/graph/nodes/ReorgLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/ReshapeLayerNode.h b/arm_compute/graph/nodes/ReshapeLayerNode.h
index 57d399a..727d253 100644
--- a/arm_compute/graph/nodes/ReshapeLayerNode.h
+++ b/arm_compute/graph/nodes/ReshapeLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/ResizeLayerNode.h b/arm_compute/graph/nodes/ResizeLayerNode.h
index 93b4495..79f8889 100644
--- a/arm_compute/graph/nodes/ResizeLayerNode.h
+++ b/arm_compute/graph/nodes/ResizeLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/SliceLayerNode.h b/arm_compute/graph/nodes/SliceLayerNode.h
index 7f3173e..55f52a7 100644
--- a/arm_compute/graph/nodes/SliceLayerNode.h
+++ b/arm_compute/graph/nodes/SliceLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/SoftmaxLayerNode.h b/arm_compute/graph/nodes/SoftmaxLayerNode.h
index cbcd06a..0868c6f 100644
--- a/arm_compute/graph/nodes/SoftmaxLayerNode.h
+++ b/arm_compute/graph/nodes/SoftmaxLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/SplitLayerNode.h b/arm_compute/graph/nodes/SplitLayerNode.h
index 345260a..13cccdd 100644
--- a/arm_compute/graph/nodes/SplitLayerNode.h
+++ b/arm_compute/graph/nodes/SplitLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,10 +38,13 @@
 public:
     /** Default Constructor
      *
-     * @param[in] num_splits Number of splits
-     * @param[in] axis       (Optional) Axis to split on. Supported axis >= 2. Defaults to 0
+     * @param[in] num_splits  Number of splits
+     * @param[in] axis        (Optional) Axis to split on. Defaults to 0
+     * @param[in] size_splits (Optional) The sizes of each output tensor along the split dimension.
+     *                        Must sum to the dimension of value along split_dim.
+     *                        Can contain one -1 indicating that dimension is to be inferred.
      */
-    SplitLayerNode(unsigned int num_splits, unsigned int axis = 0);
+    SplitLayerNode(unsigned int num_splits, int axis = 0, std::vector<int> size_splits = std::vector<int>());
     /** Computes split layer output descriptor
      *
      * @param[in] input_descriptor Descriptor of the input tensor
@@ -51,8 +54,8 @@
      *
      * @return  A pair with the descriptor of the split and the starting coordinates
      */
-    static std::pair<TensorDescriptor, Coordinates> compute_output_descriptor(const TensorDescriptor &input_descriptor,
-                                                                              unsigned int num_splits, unsigned int axis, unsigned int idx);
+    std::pair<TensorDescriptor, Coordinates> compute_output_descriptor(const TensorDescriptor &input_descriptor,
+                                                                       unsigned int num_splits, int axis, unsigned int idx);
     /** Number of splits accessor
      *
      * @return Number of splits
@@ -72,8 +75,9 @@
     void accept(INodeVisitor &v) override;
 
 private:
-    unsigned int _num_splits;
-    unsigned int _axis;
+    unsigned int     _num_splits;
+    int              _axis;
+    std::vector<int> _size_splits;
 };
 } // namespace graph
 } // namespace arm_compute
diff --git a/arm_compute/graph/nodes/StackLayerNode.h b/arm_compute/graph/nodes/StackLayerNode.h
index 52632f5..2990895 100644
--- a/arm_compute/graph/nodes/StackLayerNode.h
+++ b/arm_compute/graph/nodes/StackLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/UpsampleLayerNode.h b/arm_compute/graph/nodes/UpsampleLayerNode.h
index cdaf206..8e43ac2 100644
--- a/arm_compute/graph/nodes/UpsampleLayerNode.h
+++ b/arm_compute/graph/nodes/UpsampleLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/nodes/YOLOLayerNode.h b/arm_compute/graph/nodes/YOLOLayerNode.h
index 22e0cf5..f9ced51 100644
--- a/arm_compute/graph/nodes/YOLOLayerNode.h
+++ b/arm_compute/graph/nodes/YOLOLayerNode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/printers/DotGraphPrinter.h b/arm_compute/graph/printers/DotGraphPrinter.h
index c763cb1..d39ddad 100644
--- a/arm_compute/graph/printers/DotGraphPrinter.h
+++ b/arm_compute/graph/printers/DotGraphPrinter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/graph/printers/Printers.h b/arm_compute/graph/printers/Printers.h
index 631b634..81ecce7 100644
--- a/arm_compute/graph/printers/Printers.h
+++ b/arm_compute/graph/printers/Printers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/Allocator.h b/arm_compute/runtime/Allocator.h
index db8818d..83f072a 100644
--- a/arm_compute/runtime/Allocator.h
+++ b/arm_compute/runtime/Allocator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/Array.h b/arm_compute/runtime/Array.h
index bbf2395..817d97a 100644
--- a/arm_compute/runtime/Array.h
+++ b/arm_compute/runtime/Array.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/BlobLifetimeManager.h b/arm_compute/runtime/BlobLifetimeManager.h
index 7ef93eb..0d69f2e 100644
--- a/arm_compute/runtime/BlobLifetimeManager.h
+++ b/arm_compute/runtime/BlobLifetimeManager.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/BlobMemoryPool.h b/arm_compute/runtime/BlobMemoryPool.h
index 755694d..8481fa2 100644
--- a/arm_compute/runtime/BlobMemoryPool.h
+++ b/arm_compute/runtime/BlobMemoryPool.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/CLArray.h b/arm_compute/runtime/CL/CLArray.h
index 62b6171..76d0ee6 100644
--- a/arm_compute/runtime/CL/CLArray.h
+++ b/arm_compute/runtime/CL/CLArray.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/CLBufferAllocator.h b/arm_compute/runtime/CL/CLBufferAllocator.h
index f6e1b96..69eac21 100644
--- a/arm_compute/runtime/CL/CLBufferAllocator.h
+++ b/arm_compute/runtime/CL/CLBufferAllocator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/CLDistribution1D.h b/arm_compute/runtime/CL/CLDistribution1D.h
index 7d832e9..0597582 100644
--- a/arm_compute/runtime/CL/CLDistribution1D.h
+++ b/arm_compute/runtime/CL/CLDistribution1D.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h
index 007a40c..f909cc3 100644
--- a/arm_compute/runtime/CL/CLFunctions.h
+++ b/arm_compute/runtime/CL/CLFunctions.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -100,6 +100,7 @@
 #include "arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h"
 #include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h"
 #include "arm_compute/runtime/CL/functions/CLMagnitude.h"
+#include "arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h"
 #include "arm_compute/runtime/CL/functions/CLMeanStdDev.h"
 #include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h"
 #include "arm_compute/runtime/CL/functions/CLMedian3x3.h"
diff --git a/arm_compute/runtime/CL/CLHOG.h b/arm_compute/runtime/CL/CLHOG.h
index a96f52a..7594f46 100644
--- a/arm_compute/runtime/CL/CLHOG.h
+++ b/arm_compute/runtime/CL/CLHOG.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/CLHelpers.h b/arm_compute/runtime/CL/CLHelpers.h
index 716d95e..9b71561 100644
--- a/arm_compute/runtime/CL/CLHelpers.h
+++ b/arm_compute/runtime/CL/CLHelpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/CLLut.h b/arm_compute/runtime/CL/CLLut.h
index a389284..8c48863 100644
--- a/arm_compute/runtime/CL/CLLut.h
+++ b/arm_compute/runtime/CL/CLLut.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/CLLutAllocator.h b/arm_compute/runtime/CL/CLLutAllocator.h
index 50b905f..169442c 100644
--- a/arm_compute/runtime/CL/CLLutAllocator.h
+++ b/arm_compute/runtime/CL/CLLutAllocator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/CLMemory.h b/arm_compute/runtime/CL/CLMemory.h
index 5842ddf..7adee66 100644
--- a/arm_compute/runtime/CL/CLMemory.h
+++ b/arm_compute/runtime/CL/CLMemory.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/CLMemoryRegion.h b/arm_compute/runtime/CL/CLMemoryRegion.h
index 02ccc6b..9d08f19 100644
--- a/arm_compute/runtime/CL/CLMemoryRegion.h
+++ b/arm_compute/runtime/CL/CLMemoryRegion.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/CLMultiHOG.h b/arm_compute/runtime/CL/CLMultiHOG.h
index 45297c7..5b26467 100644
--- a/arm_compute/runtime/CL/CLMultiHOG.h
+++ b/arm_compute/runtime/CL/CLMultiHOG.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/CLMultiImage.h b/arm_compute/runtime/CL/CLMultiImage.h
index e76ee07..a12108c 100644
--- a/arm_compute/runtime/CL/CLMultiImage.h
+++ b/arm_compute/runtime/CL/CLMultiImage.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/CLPyramid.h b/arm_compute/runtime/CL/CLPyramid.h
index 696091e..573b0fd 100644
--- a/arm_compute/runtime/CL/CLPyramid.h
+++ b/arm_compute/runtime/CL/CLPyramid.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/CLRuntimeContext.h b/arm_compute/runtime/CL/CLRuntimeContext.h
index 54c7d3c..083ac0a 100644
--- a/arm_compute/runtime/CL/CLRuntimeContext.h
+++ b/arm_compute/runtime/CL/CLRuntimeContext.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/CLScheduler.h b/arm_compute/runtime/CL/CLScheduler.h
index 573248e..8a22832 100644
--- a/arm_compute/runtime/CL/CLScheduler.h
+++ b/arm_compute/runtime/CL/CLScheduler.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,7 @@
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/runtime/CL/ICLTuner.h"
 
 namespace arm_compute
@@ -72,6 +73,13 @@
      * @param[in] flush  (Optional) Specifies if the command queue will be flushed after running the kernel.
      */
     void enqueue(ICLKernel &kernel, bool flush = true);
+    /** Schedule the execution of the passed kernel if possible.
+     *
+     * @param[in] kernel  Kernel to execute.
+     * @param[in] tensors Vector containing the tensors to operate on.
+     * @param[in] flush   (Optional) Specifies if the command queue will be flushed after running the kernel.
+     */
+    void enqueue_op(ICLKernel &kernel, ITensorPack &tensors, bool flush = true);
 
     /** Initialises the context and command queue to be used by the scheduler.
      *
@@ -143,6 +151,7 @@
     bool is_initialised() const;
 
 private:
+    void enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool flush);
     /** Flag to ensure symbols initialisation is happening before Scheduler creation */
     static std::once_flag _initialize_symbols;
 
diff --git a/arm_compute/runtime/CL/CLSubTensor.h b/arm_compute/runtime/CL/CLSubTensor.h
index 98dd28f..0a7f5f8 100644
--- a/arm_compute/runtime/CL/CLSubTensor.h
+++ b/arm_compute/runtime/CL/CLSubTensor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/CLTensor.h b/arm_compute/runtime/CL/CLTensor.h
index 102cb36..61a7dfa 100644
--- a/arm_compute/runtime/CL/CLTensor.h
+++ b/arm_compute/runtime/CL/CLTensor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/CLTensorAllocator.h b/arm_compute/runtime/CL/CLTensorAllocator.h
index 7c1bc03..c978dcd 100644
--- a/arm_compute/runtime/CL/CLTensorAllocator.h
+++ b/arm_compute/runtime/CL/CLTensorAllocator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/CLTuner.h b/arm_compute/runtime/CL/CLTuner.h
index 3b7a42f..3b45a21 100644
--- a/arm_compute/runtime/CL/CLTuner.h
+++ b/arm_compute/runtime/CL/CLTuner.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -116,6 +116,7 @@
     // Inherited methods overridden:
     void tune_kernel_static(ICLKernel &kernel) override;
     void tune_kernel_dynamic(ICLKernel &kernel) override;
+    void tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) override;
 
     /** Is the kernel_event set ?
      *
@@ -126,11 +127,12 @@
 private:
     /** Find optimal LWS using brute-force approach
      *
-     * @param[in] kernel OpenCL kernel to be tuned with LWS
+     * @param[in]     kernel  OpenCL kernel to be tuned with LWS
+     * @param[in,out] tensors Tensors for the kernel to operate on
      *
      * @return The optimal LWS to use
      */
-    cl::NDRange find_optimal_lws(ICLKernel &kernel);
+    cl::NDRange find_optimal_lws(ICLKernel &kernel, ITensorPack &tensors);
 
     std::unordered_map<std::string, cl::NDRange> _lws_table;
     cl::Event   _kernel_event;
diff --git a/arm_compute/runtime/CL/CLTunerTypes.h b/arm_compute/runtime/CL/CLTunerTypes.h
index a3385b6..e3180f2 100644
--- a/arm_compute/runtime/CL/CLTunerTypes.h
+++ b/arm_compute/runtime/CL/CLTunerTypes.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/CLTypes.h b/arm_compute/runtime/CL/CLTypes.h
index 48697af..cbc5253 100644
--- a/arm_compute/runtime/CL/CLTypes.h
+++ b/arm_compute/runtime/CL/CLTypes.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/ICLGEMMKernelSelection.h b/arm_compute/runtime/CL/ICLGEMMKernelSelection.h
index 69b9411..7be9393 100644
--- a/arm_compute/runtime/CL/ICLGEMMKernelSelection.h
+++ b/arm_compute/runtime/CL/ICLGEMMKernelSelection.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/ICLOperator.h b/arm_compute/runtime/CL/ICLOperator.h
new file mode 100644
index 0000000..526b7e9
--- /dev/null
+++ b/arm_compute/runtime/CL/ICLOperator.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_ICLOPERATOR_H
+#define ARM_COMPUTE_ICLOPERATOR_H
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/runtime/IOperator.h"
+#include "arm_compute/runtime/IRuntimeContext.h"
+#include "arm_compute/runtime/Types.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace experimental
+{
+/** Basic interface for functions which have a single async CL kernel */
+class ICLOperator : public IOperator
+{
+public:
+    /** Constructor
+     *
+     * @param[in] ctx Runtime context to be used by the function
+     */
+    ICLOperator(IRuntimeContext *ctx = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    ICLOperator(const ICLOperator &) = delete;
+    /** Default move constructor */
+    ICLOperator(ICLOperator &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    ICLOperator &operator=(const ICLOperator &) = delete;
+    /** Default move assignment operator */
+    ICLOperator &operator=(ICLOperator &&) = default;
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+    void prepare(ITensorPack &constants) override;
+    MemoryRequirements workspace() const override;
+
+protected:
+    std::unique_ptr<ICLKernel> _kernel;
+    IRuntimeContext           *_ctx;
+    MemoryRequirements         _workspace;
+};
+} // namespace experimental
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_ICLOPERATOR_H */
diff --git a/arm_compute/runtime/CL/ICLSimpleFunction.h b/arm_compute/runtime/CL/ICLSimpleFunction.h
index 5fc956d..4b1d5b1 100644
--- a/arm_compute/runtime/CL/ICLSimpleFunction.h
+++ b/arm_compute/runtime/CL/ICLSimpleFunction.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/ICLTuner.h b/arm_compute/runtime/CL/ICLTuner.h
index ba4eeac..0f951c3 100644
--- a/arm_compute/runtime/CL/ICLTuner.h
+++ b/arm_compute/runtime/CL/ICLTuner.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,8 @@
 #ifndef ARM_COMPUTE_ICLTUNER_H
 #define ARM_COMPUTE_ICLTUNER_H
 
+#include "arm_compute/core/experimental/Types.h"
+
 namespace arm_compute
 {
 class ICLKernel;
@@ -49,6 +51,12 @@
      * @param[in] kernel Kernel to tune
      */
     virtual void tune_kernel_dynamic(ICLKernel &kernel) = 0;
+    /** Tune OpenCL kernel dynamically
+     *
+     * @param[in]      kernel  Kernel to tune
+     * @param[in, out] tensors Tensors for the kernel to use
+     */
+    virtual void tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) = 0;
 };
-}
+} // namespace arm_compute
 #endif /*ARM_COMPUTE_ICLTUNER_H */
diff --git a/arm_compute/runtime/CL/Utils.h b/arm_compute/runtime/CL/Utils.h
index e317569..c699213 100644
--- a/arm_compute/runtime/CL/Utils.h
+++ b/arm_compute/runtime/CL/Utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLAbsoluteDifference.h b/arm_compute/runtime/CL/functions/CLAbsoluteDifference.h
index 26aded6..b0f1948 100644
--- a/arm_compute/runtime/CL/functions/CLAbsoluteDifference.h
+++ b/arm_compute/runtime/CL/functions/CLAbsoluteDifference.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLAccumulate.h b/arm_compute/runtime/CL/functions/CLAccumulate.h
index b47f0c0..9dbf13b 100644
--- a/arm_compute/runtime/CL/functions/CLAccumulate.h
+++ b/arm_compute/runtime/CL/functions/CLAccumulate.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLActivationLayer.h b/arm_compute/runtime/CL/functions/CLActivationLayer.h
index fbb34e5..632487c 100644
--- a/arm_compute/runtime/CL/functions/CLActivationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLActivationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #ifndef ARM_COMPUTE_CLACTIVATIONLAYER_H
 #define ARM_COMPUTE_CLACTIVATIONLAYER_H
 
+#include "arm_compute/runtime/CL/ICLOperator.h"
 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"
 
 #include "arm_compute/core/Types.h"
@@ -36,7 +37,7 @@
  *
  * @note The function simulates an activation layer with the specified activation function.
  */
-class CLActivationLayer : public ICLSimpleFunction
+class CLActivationLayer : public IFunction
 {
 public:
     /** Constructor
@@ -44,14 +45,16 @@
      * @param[in] ctx Runtime context to be used by the function
      */
     CLActivationLayer(CLRuntimeContext *ctx = nullptr);
+    /** Destructor */
+    ~CLActivationLayer();
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     CLActivationLayer(const CLActivationLayer &) = delete;
     /** Default move constructor */
-    CLActivationLayer(CLActivationLayer &&) = default;
+    CLActivationLayer(CLActivationLayer &&);
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     CLActivationLayer &operator=(const CLActivationLayer &) = delete;
     /** Default move assignment operator */
-    CLActivationLayer &operator=(CLActivationLayer &&) = default;
+    CLActivationLayer &operator=(CLActivationLayer &&);
     /** Set the input and output tensor.
      *
      * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place
@@ -83,6 +86,41 @@
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
+
+namespace experimental
+{
+/** Basic function to run @ref CLActivationLayerKernel */
+class CLActivation : public ICLOperator
+{
+public:
+    /** Set the input and output tensor.
+     *
+     * @param[in]      compile_context The compile context to be used.
+     * @param[in, out] input           Source tensor info. In case of @p output tensor = nullptr, this tensor will store the result
+     *                                 of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
+     * @param[out]     output          Destination tensor info. Data type supported: same as @p input
+     * @param[in]      act_info        Activation layer parameters.
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *output, ActivationLayerInfo act_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLActivationLayer
+     *
+     * @param[in] input    Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result
+     *                     of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
+     * @param[in] output   Destination tensor info. Data type supported: same as @p input
+     * @param[in] act_info Activation layer information.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info);
+};
+} // namespace experimental
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CLACTIVATIONLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h b/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h
index b0d29bc..dc0c37e 100644
--- a/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h
+++ b/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,9 +25,9 @@
 #define ARM_COMPUTE_CLARGMINMAXLAYER_H
 
 #include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
@@ -55,7 +55,7 @@
     CLArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Input source tensor. Data types supported: QASYMM8/F16/F32.
+     * @param[in]  input  Input source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
      * @param[in]  axis   Axis to find max/min index.
      * @param[out] output Output source tensor. Data types supported: U32/S32.
      * @param[in]  op     Reduction operation to perform. Operations supported: ARG_IDX_MAX, ARG_IDX_MIN
@@ -64,7 +64,7 @@
     /** Set the input and output tensors.
      *
      * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input source tensor. Data types supported: QASYMM8/F16/F32.
+     * @param[in]  input           Input source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
      * @param[in]  axis            Axis to find max/min index.
      * @param[out] output          Output source tensor. Data types supported: U32/S32.
      * @param[in]  op              Reduction operation to perform. Operations supported: ARG_IDX_MAX, ARG_IDX_MIN
@@ -72,7 +72,7 @@
     void configure(const CLCompileContext &compile_context, const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op);
     /** Static function to check if given info will lead to a valid configuration of @ref CLArgMinMaxLayer
      *
-     * @param[in] input  Input source tensor info. Data types supported: QASYMM8/F16/F32.
+     * @param[in] input  Input source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
      * @param[in] axis   Axis to find max/min index.
      * @param[in] output Output source tensor info. Data types supported: U32/S32.
      * @param[in] op     Reduction operation to perform. Operations supported: ARG_IDX_MAX, ARG_IDX_MIN
@@ -89,7 +89,7 @@
     std::vector<CLTensor>               _results_vector;
     CLTensor                            _not_reshaped_output;
     std::vector<CLArgMinMaxLayerKernel> _reduction_kernels_vector;
-    CLReshapeLayerKernel                _reshape_kernel;
+    CLReshapeLayer                      _reshape;
     unsigned int                        _num_of_stages;
     unsigned int                        _reduction_axis;
 };
diff --git a/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h
index a211ea6..c22991d 100644
--- a/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h b/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h
index 6edb464..ba57921 100644
--- a/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h
+++ b/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLBitwiseAnd.h b/arm_compute/runtime/CL/functions/CLBitwiseAnd.h
index 1faded0..3c28938 100644
--- a/arm_compute/runtime/CL/functions/CLBitwiseAnd.h
+++ b/arm_compute/runtime/CL/functions/CLBitwiseAnd.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLBitwiseNot.h b/arm_compute/runtime/CL/functions/CLBitwiseNot.h
index c946055..4c21d56 100644
--- a/arm_compute/runtime/CL/functions/CLBitwiseNot.h
+++ b/arm_compute/runtime/CL/functions/CLBitwiseNot.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLBitwiseOr.h b/arm_compute/runtime/CL/functions/CLBitwiseOr.h
index 4fb93cc..8a48173 100644
--- a/arm_compute/runtime/CL/functions/CLBitwiseOr.h
+++ b/arm_compute/runtime/CL/functions/CLBitwiseOr.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLBitwiseXor.h b/arm_compute/runtime/CL/functions/CLBitwiseXor.h
index 6caa013..6928e59 100644
--- a/arm_compute/runtime/CL/functions/CLBitwiseXor.h
+++ b/arm_compute/runtime/CL/functions/CLBitwiseXor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h b/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h
index b09359d..5e4e890 100644
--- a/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h
+++ b/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLBox3x3.h b/arm_compute/runtime/CL/functions/CLBox3x3.h
index a4cf4d2..2d2aa47 100644
--- a/arm_compute/runtime/CL/functions/CLBox3x3.h
+++ b/arm_compute/runtime/CL/functions/CLBox3x3.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLCannyEdge.h b/arm_compute/runtime/CL/functions/CLCannyEdge.h
index 2729d24..f9d9f8f 100644
--- a/arm_compute/runtime/CL/functions/CLCannyEdge.h
+++ b/arm_compute/runtime/CL/functions/CLCannyEdge.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLCast.h b/arm_compute/runtime/CL/functions/CLCast.h
index 6a1835c..592368d 100644
--- a/arm_compute/runtime/CL/functions/CLCast.h
+++ b/arm_compute/runtime/CL/functions/CLCast.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLChannelCombine.h b/arm_compute/runtime/CL/functions/CLChannelCombine.h
index 474830d..4e3d10c 100644
--- a/arm_compute/runtime/CL/functions/CLChannelCombine.h
+++ b/arm_compute/runtime/CL/functions/CLChannelCombine.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLChannelExtract.h b/arm_compute/runtime/CL/functions/CLChannelExtract.h
index aa25516..cf042b4 100644
--- a/arm_compute/runtime/CL/functions/CLChannelExtract.h
+++ b/arm_compute/runtime/CL/functions/CLChannelExtract.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h b/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h
index 183a2f1e..e0bb3d0 100644
--- a/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h
+++ b/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLColorConvert.h b/arm_compute/runtime/CL/functions/CLColorConvert.h
index 8721e8a..e4017c2 100644
--- a/arm_compute/runtime/CL/functions/CLColorConvert.h
+++ b/arm_compute/runtime/CL/functions/CLColorConvert.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLComparison.h b/arm_compute/runtime/CL/functions/CLComparison.h
index 4e681e7..c6d61e4 100644
--- a/arm_compute/runtime/CL/functions/CLComparison.h
+++ b/arm_compute/runtime/CL/functions/CLComparison.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -79,7 +79,7 @@
 public:
     /** Initialise the kernel's inputs and outputs.
      *
-     * @param[in]  input1 Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
+     * @param[in]  input1 Source tensor. Data types supported: All.
      *                    The input1 tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
      * @param[in]  input2 Source tensor. Data types supported: Same as @p input1.
      *                    The input2 tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
@@ -92,7 +92,7 @@
     /** Initialise the kernel's inputs and outputs.
      *
      * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input1          Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
+     * @param[in]  input1          Source tensor. Data types supported: All.
      *                             The input1 tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
      * @param[in]  input2          Source tensor. Data types supported: Same as @p input1.
      *                             The input2 tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
@@ -101,7 +101,7 @@
     void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref CLComparison
      *
-     * @param[in] input1 Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
+     * @param[in] input1 Source tensor. Data types supported: All.
      * @param[in] input2 Source tensor. Data types supported: Same as @p input1.
      * @param[in] output Destination tensor. Data types supported: U8.
      *
diff --git a/arm_compute/runtime/CL/functions/CLComputeAllAnchors.h b/arm_compute/runtime/CL/functions/CLComputeAllAnchors.h
index 15c5bfe..a2f1a4e 100644
--- a/arm_compute/runtime/CL/functions/CLComputeAllAnchors.h
+++ b/arm_compute/runtime/CL/functions/CLComputeAllAnchors.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h
index b8e3361..f535c8e 100644
--- a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h
+++ b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #ifndef ARM_COMPUTE_CLCONCATENATELAYER_H
 #define ARM_COMPUTE_CLCONCATENATELAYER_H
 
+#include "arm_compute/runtime/CL/ICLOperator.h"
 #include "arm_compute/runtime/IFunction.h"
 
 #include "arm_compute/core/CL/ICLKernel.h"
@@ -51,16 +52,25 @@
 public:
     /** Default constructor */
     CLConcatenateLayer();
+    /** Destructor */
+    ~CLConcatenateLayer();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLConcatenateLayer(const CLConcatenateLayer &) = delete;
+    /** Default move constructor */
+    CLConcatenateLayer(CLConcatenateLayer &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLConcatenateLayer &operator=(const CLConcatenateLayer &) = delete;
+    /** Default move assignment operator */
+    CLConcatenateLayer &operator=(CLConcatenateLayer &&);
     /** Initialise the kernel's inputs vector and output.
      *
      * @note Input and output tensor dimensions preconditions defer depending on the concatenation axis.
      * @note Preconditions can be found respectively at @ref CLWidthConcatenateLayerKernel, @ref CLHeightConcatenateLayerKernel and @ref CLDepthConcatenateLayerKernel.
      *
-     * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: All.
+     * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: All
      * @param[out]    output        Output tensor. Data types supported: Same as @p input.
      * @param[in]     axis          Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
      */
-    void configure(std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis);
     void configure(std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis);
     /** Initialise the kernel's inputs vector and output.
      *
@@ -68,11 +78,10 @@
      * @note Preconditions can be found respectively at @ref CLWidthConcatenateLayerKernel, @ref CLHeightConcatenateLayerKernel and @ref CLDepthConcatenateLayerKernel.
      *
      * @param[in]     compile_context The compile context to be used.
-     * @param[in,out] inputs_vector   The vectors containing all the tensors to concatenate. Data types supported: All.
+     * @param[in,out] inputs_vector   The vectors containing all the tensors to concatenate. Data types supported: All
      * @param[out]    output          Output tensor. Data types supported: Same as @p input.
      * @param[in]     axis            Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
      */
-    void configure(const CLCompileContext &compile_context, std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis);
     void configure(const CLCompileContext &compile_context, std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis);
     /** Static function to check if given info will lead to a valid configuration of @ref CLConcatenateLayer
      *
@@ -85,22 +94,63 @@
      *
      * @return a status
      */
-    static Status validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis);
     static Status validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis);
 
     // Inherited methods overridden:
     void run() override;
 
 private:
-    template <typename TensorType>
-    void configure_internal(const CLCompileContext &compile_context, std::vector<TensorType *> &&inputs_vector, ICLTensor *output, size_t axis);
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
+};
 
-    template <typename TensorInfoType>
-    static Status validate_internal(const std::vector<TensorInfoType *> &inputs_vector, const ITensorInfo *output, size_t axis);
+namespace experimental
+{
+/** Basic function to execute concatenate tensors along a given axis. This function calls the following kernels:
+ *
+ * -# @ref CLWidthConcatenateLayerKernel (if underlying concatenation axis is 0).
+ * -# @ref CLHeightConcatenateLayerKernel (if underlying concatenation axis is 1).
+ * -# @ref CLDepthConcatenateLayerKernel (if underlying concatenation axis is 2).
+ * -# @ref CLBatchConcatenateLayerKernel (if underlying concatenation axis is 3).
+ */
+class CLConcatenation : public ICLOperator
+{
+public:
+    /** Default constructor */
+    CLConcatenation();
+    /** Initialise the kernel's inputs vector and output.
+     *
+     * @note Input and output tensor dimensions preconditions defer depending on the concatenation axis.
+     * @note Preconditions can be found respectively at @ref CLWidthConcatenateLayerKernel, @ref CLHeightConcatenateLayerKernel and @ref CLDepthConcatenateLayerKernel.
+     *
+     *
+     * @param[in]     compile_context The compile context to be used.
+     * @param[in,out] inputs_vector   The vectors containing all the tensors to concatenate. Data types supported: All
+     * @param[out]    output          Output tensor. Data types supported: Same as @p input.
+     * @param[in]     axis            Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
+     */
+    void configure(const CLCompileContext &compile_context, const std::vector<ITensorInfo *> &inputs_vector, ITensorInfo *output, size_t axis);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEConcatenateLayer
+     *
+     * @note Input and output tensor dimensions preconditions defer depending on the concatenation axis.
+     * @note Preconditions can be found respectively at @ref CLWidthConcatenateLayerKernel, @ref CLHeightConcatenateLayerKernel and @ref CLDepthConcatenateLayerKernel.
+     *
+     * @param[in] inputs_vector The vectors containing all the tensors info to concatenate. Data types supported: All
+     * @param[in] output        Output tensor info. Data types supported: Same as @p input.
+     * @param[in] axis          Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
+     *
+     * @return a status
+     */
+    static Status validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis);
 
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+
+private:
     std::vector<std::unique_ptr<ICLKernel>> _concat_kernels;
     unsigned int                            _num_inputs;
     unsigned int                            _axis;
 };
+} // namespace experimental
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CLCONCATENATELAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h b/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h
index 123f638..9298be2 100644
--- a/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h
+++ b/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLConvolution.h b/arm_compute/runtime/CL/functions/CLConvolution.h
index 72ef8ce..c06ad0d 100644
--- a/arm_compute/runtime/CL/functions/CLConvolution.h
+++ b/arm_compute/runtime/CL/functions/CLConvolution.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
index fff9173..ac36523 100644
--- a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLCopy.h b/arm_compute/runtime/CL/functions/CLCopy.h
index 31b73c3..c20d75e 100644
--- a/arm_compute/runtime/CL/functions/CLCopy.h
+++ b/arm_compute/runtime/CL/functions/CLCopy.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,7 +38,7 @@
 public:
     /** Initialise the function's source and destination.
      *
-     * @param[in]  input  Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+     * @param[in]  input  Source tensor. Data types supported: All.
      * @param[out] output Output tensor. Data types supported: Same as @p input.
      *
      */
@@ -46,14 +46,14 @@
     /** Initialise the function's source and destination.
      *
      * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+     * @param[in]  input           Source tensor. Data types supported: All.
      * @param[out] output          Output tensor. Data types supported: Same as @p input.
      *
      */
     void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref CLCopy
      *
-     * @param[in] input  Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+     * @param[in] input  Source tensor. Data types supported: All.
      * @param[in] output Output tensor. Data types supported: Same as @p input.
      *
      * @return a status
diff --git a/arm_compute/runtime/CL/functions/CLCropResize.h b/arm_compute/runtime/CL/functions/CLCropResize.h
index 86df0d4..e940928 100644
--- a/arm_compute/runtime/CL/functions/CLCropResize.h
+++ b/arm_compute/runtime/CL/functions/CLCropResize.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -62,7 +62,7 @@
      * @note Box indices may be outside of the bounds, in which case @p extrapolation_value is used.
      * @note Start and end indices of boxes are inclusive.
      *
-     * @param[in]  input               Source tensor containing N batches of 3D images to be cropped. Data type supported: : U16/S16/U32/S32/F16/F32
+     * @param[in]  input               Source tensor containing N batches of 3D images to be cropped. Data type supported: All
      * @param[in]  boxes               Tensor containing the boxes used to crop the images. It has to be known before configuration. Data type supported: F32
      * @param[in]  box_ind             One dimensional tensor containing the batch index of the 3D image in @p input that the corresponding
      *                                 box in @p boxes will be applied to. It has to be known before configuration. Data type supported: F32
@@ -80,7 +80,7 @@
      * @note Start and end indices of boxes are inclusive.
      *
      * @param[in]  compile_context     The compile context to be used.
-     * @param[in]  input               Source tensor containing N batches of 3D images to be cropped. Data type supported: U16/S16/U32/S32/F16/F32
+     * @param[in]  input               Source tensor containing N batches of 3D images to be cropped. Data type supported: All
      * @param[in]  boxes               Tensor containing the boxes used to crop the images. It has to be known before configuration. Data type supported: F32
      * @param[in]  box_ind             One dimensional tensor containing the batch index of the 3D image in @p input that the corresponding
      *                                 box in @p boxes will be applied to. It has to be known before configuration. Data type supported: F32
@@ -98,7 +98,7 @@
      * @note Box indices may be outside of the bounds, in which case @p extrapolation_value is used.
      * @note Start and end indices of boxes are inclusive.
      *
-     * @param[in] input               Source tensor info containing N batches of 3D images to be cropped. Data type supported: U16/S16/U32/S32/F16/F32
+     * @param[in] input               Source tensor info containing N batches of 3D images to be cropped. Data type supported: All
      * @param[in] boxes               Tensor info for the tensor containing the boxes used to crop the images. Data type supported: F32
      * @param[in] box_ind             Tensor info for the one dimensional tensor containing the batch index of the 3D image in @p input
      *                                that the corresponding box in @p boxes will be applied to. Data type supported: F32
diff --git a/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
index c75b586..df3cad6 100644
--- a/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h b/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h
index 2d3dde1..19a44f7 100644
--- a/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h
+++ b/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h b/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h
index 910b9ea..d125584 100644
--- a/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h b/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h
index dbf5898..5e197cb 100644
--- a/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
index f15271b..570b6ca 100644
--- a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLDequantizationLayer.h b/arm_compute/runtime/CL/functions/CLDequantizationLayer.h
index c0a0fcd..88ed915 100644
--- a/arm_compute/runtime/CL/functions/CLDequantizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDequantizationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLDerivative.h b/arm_compute/runtime/CL/functions/CLDerivative.h
index 5875ceb..1aba6a9 100644
--- a/arm_compute/runtime/CL/functions/CLDerivative.h
+++ b/arm_compute/runtime/CL/functions/CLDerivative.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLDilate.h b/arm_compute/runtime/CL/functions/CLDilate.h
index cc84820..adb9cf4 100644
--- a/arm_compute/runtime/CL/functions/CLDilate.h
+++ b/arm_compute/runtime/CL/functions/CLDilate.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h
index 0c81ffa..8107fa2 100644
--- a/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h
index 1fed460..232b9f5 100644
--- a/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h b/arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h
index 19729b6..5208bfe 100644
--- a/arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h
+++ b/arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #ifndef ARM_COMPUTE_CLELEMENTWISEUNARYLAYER_H
 #define ARM_COMPUTE_CLELEMENTWISEUNARYLAYER_H
 
+#include "arm_compute/runtime/CL/ICLOperator.h"
 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"
 
 namespace arm_compute
@@ -31,9 +32,21 @@
 class ICLTensor;
 
 /** Basic function to perform inverse square root on an input tensor. */
-class CLRsqrtLayer : public ICLSimpleFunction
+class CLRsqrtLayer : public IFunction
 {
 public:
+    /** Default Constructor */
+    CLRsqrtLayer();
+    /** Default Destructor */
+    ~CLRsqrtLayer();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLRsqrtLayer(const CLRsqrtLayer &) = delete;
+    /** Default move constructor */
+    CLRsqrtLayer(CLRsqrtLayer &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLRsqrtLayer &operator=(const CLRsqrtLayer &) = delete;
+    /** Default move assignment operator */
+    CLRsqrtLayer &operator=(CLRsqrtLayer &&);
     /** Initialize the function
      *
      * @param[in]  input  Input tensor. Data types supported: F16/F32.
@@ -55,12 +68,31 @@
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 
 /** Basic function to perform exponential on an input tensor. */
-class CLExpLayer : public ICLSimpleFunction
+class CLExpLayer : public IFunction
 {
 public:
+    /** Default Constructor */
+    CLExpLayer();
+    /** Default Destructor */
+    ~CLExpLayer();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLExpLayer(const CLExpLayer &) = delete;
+    /** Default move constructor */
+    CLExpLayer(CLExpLayer &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLExpLayer &operator=(const CLExpLayer &) = delete;
+    /** Default move assignment operator */
+    CLExpLayer &operator=(CLExpLayer &&);
     /** Initialize the function
      *
      * @param[in]  input  Input tensor. Data types supported: F16/F32.
@@ -82,12 +114,31 @@
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 
 /** Basic function to negate an input tensor. */
-class CLNegLayer : public ICLSimpleFunction
+class CLNegLayer : public IFunction
 {
 public:
+    /** Default Constructor */
+    CLNegLayer();
+    /** Default Destructor */
+    ~CLNegLayer();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLNegLayer(const CLNegLayer &) = delete;
+    /** Default move constructor */
+    CLNegLayer(CLNegLayer &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLNegLayer &operator=(const CLNegLayer &) = delete;
+    /** Default move assignment operator */
+    CLNegLayer &operator=(CLNegLayer &&);
     /** Initialize the function
      *
      * @param[in]  input  Input tensor. Data types supported: F16/F32.
@@ -109,12 +160,31 @@
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 
 /** Basic function to calculate sine of an input tensor. */
-class CLSinLayer : public ICLSimpleFunction
+class CLSinLayer : public IFunction
 {
 public:
+    /** Default Constructor */
+    CLSinLayer();
+    /** Default Destructor */
+    ~CLSinLayer();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLSinLayer(const CLSinLayer &) = delete;
+    /** Default move constructor */
+    CLSinLayer(CLSinLayer &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLSinLayer &operator=(const CLSinLayer &) = delete;
+    /** Default move assignment operator */
+    CLSinLayer &operator=(CLSinLayer &&);
     /** Initialize the function
      *
      * @param[in]  input  Input tensor. Data types supported: F16/F32.
@@ -136,12 +206,31 @@
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 
 /** Basic function to perform elementwise log on an input tensor. */
-class CLLogLayer : public ICLSimpleFunction
+class CLLogLayer : public IFunction
 {
 public:
+    /** Default Constructor */
+    CLLogLayer();
+    /** Default Destructor */
+    ~CLLogLayer();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLLogLayer(const CLLogLayer &) = delete;
+    /** Default move constructor */
+    CLLogLayer(CLLogLayer &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLLogLayer &operator=(const CLLogLayer &) = delete;
+    /** Default move assignment operator */
+    CLLogLayer &operator=(CLLogLayer &&);
     /** Initialize the function
      *
      * @param[in]  input  Input tensor. Data types supported: F16/F32.
@@ -163,12 +252,31 @@
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 
 /** Basic function to get the absolute value of an input tensor. */
-class CLAbsLayer : public ICLSimpleFunction
+class CLAbsLayer : public IFunction
 {
 public:
+    /** Default Constructor */
+    CLAbsLayer();
+    /** Default Destructor */
+    ~CLAbsLayer();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLAbsLayer(const CLAbsLayer &) = delete;
+    /** Default move constructor */
+    CLAbsLayer(CLAbsLayer &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLAbsLayer &operator=(const CLAbsLayer &) = delete;
+    /** Default move assignment operator */
+    CLAbsLayer &operator=(CLAbsLayer &&);
     /** Initialize the function
      *
      * @param[in]  input  Input tensor. Data types supported: F16/F32.
@@ -190,12 +298,31 @@
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 
 /** Basic function to get the round (to the nearest even) value of an input tensor. */
-class CLRoundLayer : public ICLSimpleFunction
+class CLRoundLayer : public IFunction
 {
 public:
+    /** Default Constructor */
+    CLRoundLayer();
+    /** Default Destructor */
+    ~CLRoundLayer();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLRoundLayer(const CLRoundLayer &) = delete;
+    /** Default move constructor */
+    CLRoundLayer(CLRoundLayer &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLRoundLayer &operator=(const CLRoundLayer &) = delete;
+    /** Default move assignment operator */
+    CLRoundLayer &operator=(CLRoundLayer &&);
     /** Initialize the function
      *
      * @param[in]  input  Input tensor. Data types supported: F16/F32.
@@ -217,6 +344,205 @@
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
+
+namespace experimental
+{
+/** Basic function to perform inverse square root on an input tensor. */
+class CLRsqrt : public ICLOperator
+{
+public:
+    /** Initialize the function
+     *
+     * @param[in]  input  Input tensor info. Data types supported: F16/F32.
+     * @param[out] output Output tensor info. Data types supported: same as @p input.
+     */
+    void configure(const ITensorInfo *input, ITensorInfo *output);
+    /** Initialize the function
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input tensor info. Data types supported: F16/F32.
+     * @param[out] output          Output tensor info. Data types supported: same as @p input.
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLRsqrtLayer
+     *
+     * @param[in] input  Input tensor info. Data types supported: F16/F32.
+     * @param[in] output Output tensor info. Data types supported: Same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+};
+
+/** Basic function to perform exponential on an input tensor. */
+class CLExp : public ICLOperator
+{
+public:
+    /** Initialize the function
+     *
+     * @param[in]  input  Input tensor info. Data types supported: F16/F32.
+     * @param[out] output Output tensor info. Data types supported: same as @p input.
+     */
+    void configure(const ITensorInfo *input, ITensorInfo *output);
+    /** Initialize the function
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input tensor info. Data types supported: F16/F32.
+     * @param[out] output          Output tensor info. Data types supported: same as @p input.
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLExpLayer
+     *
+     * @param[in] input  Input tensor info. Data types supported: F16/F32.
+     * @param[in] output Output tensor info. Data types supported: Same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+};
+
+/** Basic function to negate an input tensor. */
+class CLNeg : public ICLOperator
+{
+public:
+    /** Initialize the function
+     *
+     * @param[in]  input  Input tensor info. Data types supported: F16/F32.
+     * @param[out] output Output tensor info. Data types supported: same as @p input.
+     */
+    void configure(const ITensorInfo *input, ITensorInfo *output);
+    /** Initialize the function
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input tensor info. Data types supported: F16/F32.
+     * @param[out] output          Output tensor info. Data types supported: same as @p input.
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLNegLayer
+     *
+     * @param[in] input  Input tensor info. Data types supported: F16/F32.
+     * @param[in] output Output tensor info. Data types supported: Same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+};
+
+/** Basic function to calculate sine of an input tensor. */
+class CLSin : public ICLOperator
+{
+public:
+    /** Initialize the function
+     *
+     * @param[in]  input  Input tensor info. Data types supported: F16/F32.
+     * @param[out] output Output tensor info. Data types supported: same as @p input.
+     */
+    void configure(const ITensorInfo *input, ITensorInfo *output);
+    /** Initialize the function
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input tensor info. Data types supported: F16/F32.
+     * @param[out] output          Output tensor info. Data types supported: same as @p input.
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLSinLayer
+     *
+     * @param[in] input  Input tensor info. Data types supported: F16/F32.
+     * @param[in] output Output tensor info. Data types supported: Same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+};
+
+/** Basic function to perform elementwise log on an input tensor. */
+class CLLog : public ICLOperator
+{
+public:
+    /** Initialize the function
+     *
+     * @param[in]  input  Input tensor info. Data types supported: F16/F32.
+     * @param[out] output Output tensor info. Data types supported: same as @p input.
+     */
+    void configure(const ITensorInfo *input, ITensorInfo *output);
+    /** Initialize the function
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input tensor info. Data types supported: F16/F32.
+     * @param[out] output          Output tensor info. Data types supported: same as @p input.
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLLogLayer
+     *
+     * @param[in] input  Input tensor info. Data types supported: F16/F32.
+     * @param[in] output Output tensor info. Data types supported: Same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+};
+
+/** Basic function to get the absolute value of an input tensor. */
+class CLAbs : public ICLOperator
+{
+public:
+    /** Initialize the function
+     *
+     * @param[in]  input  Input tensor info. Data types supported: F16/F32.
+     * @param[out] output Output tensor info. Data types supported: same as @p input.
+     */
+    void configure(const ITensorInfo *input, ITensorInfo *output);
+    /** Initialize the function
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input tensor info. Data types supported: F16/F32.
+     * @param[out] output          Output tensor info. Data types supported: same as @p input.
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLAbsLayer
+     *
+     * @param[in] input  Input tensor info. Data types supported: F16/F32.
+     * @param[in] output Output tensor info. Data types supported: Same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+};
+
+/** Basic function to get the round (to the nearest even) value of an input tensor. */
+class CLRound : public ICLOperator
+{
+public:
+    /** Initialize the function
+     *
+     * @param[in]  input  Input tensor info. Data types supported: F16/F32.
+     * @param[out] output Output tensor info. Data types supported: same as @p input.
+     */
+    void configure(const ITensorInfo *input, ITensorInfo *output);
+    /** Initialize the function
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input tensor info. Data types supported: F16/F32.
+     * @param[out] output          Output tensor info. Data types supported: same as @p input.
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLRoundLayer
+     *
+     * @param[in] input  Input tensor info. Data types supported: F16/F32.
+     * @param[in] output Output tensor info. Data types supported: Same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+};
+} // namespace experimental
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CLELEMENTWISEUNARYLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLElementwiseOperations.h b/arm_compute/runtime/CL/functions/CLElementwiseOperations.h
index 8c656ed..2d9d438 100644
--- a/arm_compute/runtime/CL/functions/CLElementwiseOperations.h
+++ b/arm_compute/runtime/CL/functions/CLElementwiseOperations.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,99 +24,154 @@
 #ifndef ARM_COMPUTE_CLELEMENTWISEOPERATIONS_H
 #define ARM_COMPUTE_CLELEMENTWISEOPERATIONS_H
 
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/runtime/CL/ICLOperator.h"
+#include "arm_compute/runtime/IFunction.h"
 
 namespace arm_compute
 {
 class ICLTensor;
 
+namespace experimental
+{
 /** Basic function to run @ref CLSaturatedArithmeticOperationKernel for addition
  *
- * @note The tensor data type for the inputs must be U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
+ * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
  * @note The function performs an arithmetic addition between two tensors.
  */
-class CLArithmeticAddition : public ICLSimpleFunction
+class CLArithmeticAddition : public ICLOperator
 {
 public:
+    /** Default Constructor */
+    CLArithmeticAddition();
     /** Initialise the kernel's inputs, output and conversion policy.
      *
-     * @param[in, out] input1   First tensor input. Data types supported: U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
-     *                          The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
-     * @param[in, out] input2   Second tensor input. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QSYMM16 (only if @p input1 is QSYMM16), S16/F16/F32.
-     *                          The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
-     * @param[out]     output   Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), QSYMM16 (only if both inputs is QSYMM16), S16/F16/F32.
-     * @param[in]      policy   Policy to use to handle overflow.
-     * @param[in]      act_info (Optional) Activation layer information in case of a fused activation.
-     */
-    void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-    /** Initialise the kernel's inputs, output and conversion policy.
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)           -> U8
+     *   - (U8,U8)           -> S16
+     *   - (S16,U8)          -> S16
+     *   - (U8,S16)          -> S16
+     *   - (S16,S16)         -> S16
+     *   - (S32,S32)         -> S32
+     *   - (F16,F16)         -> F16
+     *   - (F32,F32)         -> F32
+     *   - (QASYMM8,QASYMM8) -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16) -> QSYMM16
      *
      * @param[in]      compile_context The compile context to be used.
-     * @param[in, out] input1          First tensor input. Data types supported: U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
+     * @param[in, out] input1          First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
      *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
-     * @param[in, out] input2          Second tensor input. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QSYMM16 (only if @p input1 is QSYMM16), S16/F16/F32.
+     * @param[in, out] input2          Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
      *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
-     * @param[out]     output          Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), QSYMM16 (only if both inputs is QSYMM16), S16/F16/F32.
+     * @param[out]     output          Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
      * @param[in]      policy          Policy to use to handle overflow.
      * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref CLSaturatedArithmeticOperationKernel for addition
      *
-     * @param[in] input1   First tensor input info. Data types supported: U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
-     * @param[in] input2   Second tensor input info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QSYMM16 (only if @p input1 is QSYMM16), S16/F16/F32.
-     * @param[in] output   Output tensor info. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), QSYMM16 (only if both inputs is QSYMM16), S16/F16/F32.
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)           -> U8
+     *   - (U8,U8)           -> S16
+     *   - (S16,U8)          -> S16
+     *   - (U8,S16)          -> S16
+     *   - (S16,S16)         -> S16
+     *   - (S32,S32)         -> S32
+     *   - (F16,F16)         -> F16
+     *   - (F32,F32)         -> F32
+     *   - (QASYMM8,QASYMM8) -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16) -> QSYMM16
+     *
+     * @param[in] input1   First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     * @param[in] input2   Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     * @param[in] output   Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
      * @param[in] policy   Policy to use to handle overflow.
      * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
      *
      * @return a status
      */
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+
+private:
+    CLFillBorderKernel _border_handler;
 };
 
 /** Basic function to run @ref CLSaturatedArithmeticOperationKernel for subtraction
  *
- * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/U32/F16/F32.
+ * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
  * @note The function performs an arithmetic subtraction between two tensors.
  */
-class CLArithmeticSubtraction : public ICLSimpleFunction
+class CLArithmeticSubtraction : public ICLOperator
 {
 public:
+    /** Default Constructor */
+    CLArithmeticSubtraction();
     /** Initialise the kernel's inputs, output and conversion policy.
      *
-     * @param[in, out] input1   First tensor input. Data types supported: U8/QASYMM8/S16/S32/U32/F16/F32.
-     *                          The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
-     * @param[in, out] input2   Second tensor input. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16/F16/F32.
-     *                          The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
-     * @param[out]     output   Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16/F16/F32.
-     * @param[in]      policy   Policy to use to handle overflow.
-     * @param[in]      act_info (Optional) Activation layer information in case of a fused activation.
-     */
-    void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-    /** Initialise the kernel's inputs, output and conversion policy.
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)           -> U8
+     *   - (U8,U8)           -> S16
+     *   - (S16,U8)          -> S16
+     *   - (U8,S16)          -> S16
+     *   - (S16,S16)         -> S16
+     *   - (S32,S32)         -> S32
+     *   - (F16,F16)         -> F16
+     *   - (F32,F32)         -> F32
+     *   - (QASYMM8,QASYMM8) -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16) -> QSYMM16
      *
      * @param[in]      compile_context The compile context to be used.
-     * @param[in, out] input1          First tensor input. Data types supported: U8/QASYMM8/S16/S32/U32/F16/F32.
+     * @param[in, out] input1          First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
      *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
-     * @param[in, out] input2          Second tensor input. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16/F16/F32.
+     * @param[in, out] input2          Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
      *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
-     * @param[out]     output          Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16/F16/F32.
+     * @param[out]     output          Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
      * @param[in]      policy          Policy to use to handle overflow.
      * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref CLSaturatedArithmeticOperationKernel for subtraction
      *
-     * @param[in] input1   First tensor input info. Data types supported: U8/QASYMM8/S16/S32/U32/F16/F32.
-     * @param[in] input2   Second tensor input info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16/F16/F32.
-     * @param[in] output   Output tensor info. Data types supported: U8 (Only if both inputs are U8), QASYMM8 ( only if both inputs are QASYMM8), S16/F16/F32.
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)           -> U8
+     *   - (U8,U8)           -> S16
+     *   - (S16,U8)          -> S16
+     *   - (U8,S16)          -> S16
+     *   - (S16,S16)         -> S16
+     *   - (S32,S32)         -> S32
+     *   - (F16,F16)         -> F16
+     *   - (F32,F32)         -> F32
+     *   - (QASYMM8,QASYMM8) -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16) -> QSYMM16
+     *
+     * @param[in] input1   First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     * @param[in] input2   Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     * @param[in] output   Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
      * @param[in] policy   Policy to use to handle overflow.
      * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
      *
      * @return a status
      */
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+
+private:
+    CLFillBorderKernel _border_handler;
 };
 
 /** Basic function to run @ref CLSaturatedArithmeticOperationKernel for division
@@ -124,9 +179,429 @@
  * @note The tensor data type for the inputs must be F16/F32.
  * @note The function performs an arithmetic division between two tensors.
  */
-class CLArithmeticDivision : public ICLSimpleFunction
+class CLArithmeticDivision : public ICLOperator
 {
 public:
+    /** Default Constructor */
+    CLArithmeticDivision();
+    /** Initialise the kernel's inputs, output.
+     *
+     * @param[in]      compile_context The compile context to be used.
+     * @param[in, out] input1          First tensor input. Data types supported: F16/F32.
+     *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[in, out] input2          Second tensor input. Same as @p input1.
+     *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[out]     output          Output tensor. Data types supported: Same as @p input1.
+     * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticDivision
+     *
+     * @param[in] input1   First tensor input info. Data types supported: F16/F32.
+     * @param[in] input2   Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output   Output tensor info. Data types supported: Same as @p input1.
+     * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+
+private:
+    CLFillBorderKernel _border_handler;
+};
+
+/** Basic function to run @ref CLArithmeticOperationKernel for max
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
+ * @note The function performs a max operation between two tensors.
+ */
+class CLElementwiseMax : public ICLOperator
+{
+public:
+    /** Default Constructor */
+    CLElementwiseMax();
+    /** Initialise the kernel's inputs, output and conversion policy.
+     *
+     * @param[in]      compile_context The compile context to be used.
+     * @param[in, out] input1          First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
+     *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[in, out] input2          Second tensor input. Data types supported: same as @p input1.
+     *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[out]     output          Output tensor. Data types supported: same as @p input1.
+     * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel for max
+     *
+     * @param[in] input1   First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
+     * @param[in] input2   Second tensor input info. Data types supported: same as @p input1.
+     * @param[in] output   Output tensor info. Data types supported: same as @p input1.
+     * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+
+private:
+    CLFillBorderKernel _border_handler;
+};
+
+/** Basic function to run @ref CLArithmeticOperationKernel for min
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
+ * @note The function performs a min operation between two tensors.
+ */
+class CLElementwiseMin : public ICLOperator
+{
+public:
+    /** Default Constructor */
+    CLElementwiseMin();
+    /** Initialise the kernel's inputs, output and conversion policy.
+     *
+     * @param[in]      compile_context The compile context to be used.
+     * @param[in, out] input1          First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
+     *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[in, out] input2          Second tensor input. Data types supported: same as @p input1.
+     *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[out]     output          Output tensor. Data types supported: same as @p input1.
+     * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel for min
+     *
+     * @param[in] input1   First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
+     * @param[in] input2   Second tensor input info. Data types supported: same as @p input1.
+     * @param[in] output   Output tensor info. Data types supported: same as @p input1.
+     * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+
+private:
+    CLFillBorderKernel _border_handler;
+};
+
+/** Basic function to run @ref CLArithmeticOperationKernel for squared difference
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+ * @note The function performs a squared difference operation between two tensors (i.e., out[i] = (in1[i] - in2[i])^2)
+ */
+class CLElementwiseSquaredDiff : public ICLOperator
+{
+public:
+    /** Default Constructor */
+    CLElementwiseSquaredDiff();
+    /** Initialise the kernel's inputs, output and conversion policy.
+     *
+     * @param[in]      compile_context The compile context to be used.
+     * @param[in, out] input1          First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[in, out] input2          Second tensor input. Data types supported: same as @p input1.
+     *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[out]     output          Output tensor. Data types supported: same as @p input1.
+     * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel for squared difference
+     *
+     * @param[in] input1   First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     * @param[in] input2   Second tensor input info. Data types supported: same as @p input1.
+     * @param[in] output   Output tensor info. Data types supported: same as @p input1.
+     * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+
+private:
+    CLFillBorderKernel _border_handler;
+};
+
+/** Basic function to run @ref CLArithmeticOperationKernel for power
+ *
+ * @note The tensor data type for the inputs must be F16/F32.
+ * @note The function performs an elementwise power of in1 to in2 (i.e., out[i] = in1[i] ^ in2[i])
+ */
+class CLElementwisePower : public ICLOperator
+{
+public:
+    /** Default Constructor */
+    CLElementwisePower();
+    /** Initialise the kernel's inputs, output and conversion policy.
+     *
+     * @param[in]      compile_context The compile context to be used.
+     * @param[in, out] input1          First tensor input. Data types supported: F16/F32.
+     *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[in, out] input2          Second tensor input. Data types supported: F16/F32.
+     *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[out]     output          Output tensor. Data types supported: F16/F32.
+     * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel for power
+     *
+     * @param[in] input1   First tensor input info. Data types supported: F16/F32.
+     * @param[in] input2   Second tensor input info. Data types supported: F16/F32.
+     * @param[in] output   Output tensor info. Data types supported: F16/F32.
+     * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+
+private:
+    CLFillBorderKernel _border_handler;
+};
+} // namespace experimental
+
+/** Basic function to run @ref CLSaturatedArithmeticOperationKernel for addition
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+ * @note The function performs an arithmetic addition between two tensors.
+ */
+class CLArithmeticAddition : public IFunction
+{
+public:
+    /** Default Constructor */
+    CLArithmeticAddition();
+    /** Default Destructor */
+    ~CLArithmeticAddition();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLArithmeticAddition(const CLArithmeticAddition &) = delete;
+    /** Default move constructor */
+    CLArithmeticAddition(CLArithmeticAddition &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLArithmeticAddition &operator=(const CLArithmeticAddition &) = delete;
+    /** Default move assignment operator */
+    CLArithmeticAddition &operator=(CLArithmeticAddition &&);
+    /** Initialise the kernel's inputs, output and conversion policy.
+     *
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)           -> U8
+     *   - (U8,U8)           -> S16
+     *   - (S16,U8)          -> S16
+     *   - (U8,S16)          -> S16
+     *   - (S16,S16)         -> S16
+     *   - (S32,S32)         -> S32
+     *   - (F16,F16)         -> F16
+     *   - (F32,F32)         -> F32
+     *   - (QASYMM8,QASYMM8) -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16) -> QSYMM16
+     *
+     * @param[in, out] input1   First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     *                          The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[in, out] input2   Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     *                          The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[out]     output   Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     * @param[in]      policy   Policy to use to handle overflow.
+     * @param[in]      act_info (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Initialise the kernel's inputs, output and conversion policy.
+     *
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)           -> U8
+     *   - (U8,U8)           -> S16
+     *   - (S16,U8)          -> S16
+     *   - (U8,S16)          -> S16
+     *   - (S16,S16)         -> S16
+     *   - (S32,S32)         -> S32
+     *   - (F16,F16)         -> F16
+     *   - (F32,F32)         -> F32
+     *   - (QASYMM8,QASYMM8) -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16) -> QSYMM16
+     *
+     * @param[in]      compile_context The compile context to be used.
+     * @param[in, out] input1          First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[in, out] input2          Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[out]     output          Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     * @param[in]      policy          Policy to use to handle overflow.
+     * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref CLSaturatedArithmeticOperationKernel for addition
+     *
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)           -> U8
+     *   - (U8,U8)           -> S16
+     *   - (S16,U8)          -> S16
+     *   - (U8,S16)          -> S16
+     *   - (S16,S16)         -> S16
+     *   - (S32,S32)         -> S32
+     *   - (F16,F16)         -> F16
+     *   - (F32,F32)         -> F32
+     *   - (QASYMM8,QASYMM8) -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16) -> QSYMM16
+     *
+     * @param[in] input1   First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     * @param[in] input2   Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     * @param[in] output   Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     * @param[in] policy   Policy to use to handle overflow.
+     * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
+};
+
+/** Basic function to run @ref CLSaturatedArithmeticOperationKernel for subtraction
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32.
+ * @note The function performs an arithmetic subtraction between two tensors.
+ */
+class CLArithmeticSubtraction : public IFunction
+{
+public:
+    /** Default Constructor */
+    CLArithmeticSubtraction();
+    /** Default Destructor */
+    ~CLArithmeticSubtraction();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLArithmeticSubtraction(const CLArithmeticSubtraction &) = delete;
+    /** Default move constructor */
+    CLArithmeticSubtraction(CLArithmeticSubtraction &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLArithmeticSubtraction &operator=(const CLArithmeticSubtraction &) = delete;
+    /** Default move assignment operator */
+    CLArithmeticSubtraction &operator=(CLArithmeticSubtraction &&);
+    /** Initialise the kernel's inputs, output and conversion policy.
+     *
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)           -> U8
+     *   - (U8,U8)           -> S16
+     *   - (S16,U8)          -> S16
+     *   - (U8,S16)          -> S16
+     *   - (S16,S16)         -> S16
+     *   - (S32,S32)         -> S32
+     *   - (F16,F16)         -> F16
+     *   - (F32,F32)         -> F32
+     *   - (QASYMM8,QASYMM8) -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16) -> QSYMM16
+     *
+     * @param[in, out] input1   First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     *                          The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[in, out] input2   Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     *                          The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[out]     output   Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     * @param[in]      policy   Policy to use to handle overflow.
+     * @param[in]      act_info (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Initialise the kernel's inputs, output and conversion policy.
+     *
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)           -> U8
+     *   - (U8,U8)           -> S16
+     *   - (S16,U8)          -> S16
+     *   - (U8,S16)          -> S16
+     *   - (S16,S16)         -> S16
+     *   - (S32,S32)         -> S32
+     *   - (F16,F16)         -> F16
+     *   - (F32,F32)         -> F32
+     *   - (QASYMM8,QASYMM8) -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16) -> QSYMM16
+     *
+     * @param[in]      compile_context The compile context to be used.
+     * @param[in, out] input1          First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[in, out] input2          Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[out]     output          Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     * @param[in]      policy          Policy to use to handle overflow.
+     * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref CLSaturatedArithmeticOperationKernel for subtraction
+     *
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)           -> U8
+     *   - (U8,U8)           -> S16
+     *   - (S16,U8)          -> S16
+     *   - (U8,S16)          -> S16
+     *   - (S16,S16)         -> S16
+     *   - (S32,S32)         -> S32
+     *   - (F16,F16)         -> F16
+     *   - (F32,F32)         -> F32
+     *   - (QASYMM8,QASYMM8) -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16) -> QSYMM16
+     *
+     * @param[in] input1   First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     * @param[in] input2   Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     * @param[in] output   Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     * @param[in] policy   Policy to use to handle overflow.
+     * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
+};
+
+/** Basic function to run @ref CLArithmeticOperationKernel for division
+ *
+ * @note The tensor data type for the inputs must be F16/F32.
+ * @note The function performs an arithmetic division between two tensors.
+ */
+class CLArithmeticDivision : public IFunction
+{
+public:
+    /** Default Constructor */
+    CLArithmeticDivision();
+    /** Default Destructor */
+    ~CLArithmeticDivision();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLArithmeticDivision(const CLArithmeticDivision &) = delete;
+    /** Default move constructor */
+    CLArithmeticDivision(CLArithmeticDivision &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLArithmeticDivision &operator=(const CLArithmeticDivision &) = delete;
+    /** Default move assignment operator */
+    CLArithmeticDivision &operator=(CLArithmeticDivision &&);
     /** Initialise the kernel's inputs, output.
      *
      * @param[in, out] input1   First tensor input. Data types supported: F16/F32.
@@ -147,7 +622,7 @@
      * @param[out]     output          Output tensor. Data types supported: Same as @p input1.
      * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticDivision
      *
      * @param[in] input1   First tensor input info. Data types supported: F16/F32.
@@ -158,6 +633,13 @@
      * @return a status
      */
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 
 /** Basic function to run @ref CLArithmeticOperationKernel for max
@@ -165,40 +647,59 @@
  * @note The tensor data type for the inputs must be U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
  * @note The function performs a max operation between two tensors.
  */
-class CLElementwiseMax : public ICLSimpleFunction
+class CLElementwiseMax : public IFunction
 {
 public:
+    /** Default Constructor */
+    CLElementwiseMax();
+    /** Default Destructor */
+    ~CLElementwiseMax();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLElementwiseMax(const CLElementwiseMax &) = delete;
+    /** Default move constructor */
+    CLElementwiseMax(CLElementwiseMax &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLElementwiseMax &operator=(const CLElementwiseMax &) = delete;
+    /** Default move assignment operator */
+    CLElementwiseMax &operator=(CLElementwiseMax &&);
     /** Initialise the kernel's inputs, output and conversion policy.
      *
-     * @param[in, out] input1   First tensor input. Data types supported: U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
+     * @param[in, out] input1   First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
      *                          The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
-     * @param[in, out] input2   Second tensor input. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, QSYMM16 (only if @p input1 is QSYMM16), F16/F32.
+     * @param[in, out] input2   Second tensor input. Data types supported: same as @p input1.
      *                          The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
-     * @param[out]     output   Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16, QSYMM16 (only if both inputs are QSYMM16), F16/F32.
+     * @param[out]     output   Output tensor. Data types supported: same as @p input1.
      * @param[in]      act_info (Optional) Activation layer information in case of a fused activation.
      */
     void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Initialise the kernel's inputs, output and conversion policy.
      *
      * @param[in]      compile_context The compile context to be used.
-     * @param[in, out] input1          First tensor input. Data types supported: U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
+     * @param[in, out] input1          First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
      *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
-     * @param[in, out] input2          Second tensor input. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, QSYMM16 (only if @p input1 is QSYMM16), F16/F32.
+     * @param[in, out] input2          Second tensor input. Data types supported: same as @p input1.
      *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
-     * @param[out]     output          Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16, QSYMM16 (only if both inputs are QSYMM16), F16/F32.
+     * @param[out]     output          Output tensor. Data types supported: same as @p input1.
      * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation.
      */
     void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel for max
      *
-     * @param[in] input1   First tensor input info. Data types supported: U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
-     * @param[in] input2   Second tensor input info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, QSYMM16 (only if @p input1 is QSYMM16), F16/F32.
-     * @param[in] output   Output tensor info. Data types supported: U8 (Only if both inputs are U8), QASYMM8 ( only if both inputs are QASYMM8), S16, QSYMM16 (only if both inputs are QSYMM16), F16/F32.
+     * @param[in] input1   First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
+     * @param[in] input2   Second tensor input info. Data types supported: same as @p input1.
+     * @param[in] output   Output tensor info. Data types supported: same as @p input1.
      * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
      *
      * @return a status
      */
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 
 /** Basic function to run @ref CLArithmeticOperationKernel for min
@@ -206,40 +707,59 @@
  * @note The tensor data type for the inputs must be U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
  * @note The function performs a max operation between two tensors.
  */
-class CLElementwiseMin : public ICLSimpleFunction
+class CLElementwiseMin : public IFunction
 {
 public:
+    /** Default Constructor */
+    CLElementwiseMin();
+    /** Default Destructor */
+    ~CLElementwiseMin();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLElementwiseMin(const CLElementwiseMin &) = delete;
+    /** Default move constructor */
+    CLElementwiseMin(CLElementwiseMin &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLElementwiseMin &operator=(const CLElementwiseMin &) = delete;
+    /** Default move assignment operator */
+    CLElementwiseMin &operator=(CLElementwiseMin &&);
     /** Initialise the kernel's inputs, output and conversion policy.
      *
-     * @param[in, out] input1   First tensor input. Data types supported: U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
+     * @param[in, out] input1   First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
      *                          The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
-     * @param[in, out] input2   Second tensor input. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, QSYMM16 (only if @p input1 is QSYMM16), F16/F32.
+     * @param[in, out] input2   Second tensor input. Data types supported: same as @p input1.
      *                          The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
-     * @param[out]     output   Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16, QSYMM16 (only if both inputs are QSYMM16), F16/F32.
+     * @param[out]     output   Output tensor. Data types supported: same as @p input1.
      * @param[in]      act_info (Optional) Activation layer information in case of a fused activation.
      */
     void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Initialise the kernel's inputs, output and conversion policy.
      *
      * @param[in]      compile_context The compile context to be used.
-     * @param[in, out] input1          First tensor input. Data types supported: U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
+     * @param[in, out] input1          First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
      *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
-     * @param[in, out] input2          Second tensor input. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, QSYMM16 (only if @p input1 is QSYMM16), F16/F32.
+     * @param[in, out] input2          Second tensor input. Data types supported: same as @p input1.
      *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
-     * @param[out]     output          Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16, QSYMM16 (only if both inputs are QSYMM16), F16/F32.
+     * @param[out]     output          Output tensor. Data types supported: same as @p input1.
      * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation.
      */
     void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel for min
      *
-     * @param[in] input1   First tensor input info. Data types supported: U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
-     * @param[in] input2   Second tensor input info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, QSYMM16 (only if @p input1 is QSYMM16), F16/F32.
-     * @param[in] output   Output tensor info. Data types supported: U8 (Only if both inputs are U8), QASYMM8 ( only if both inputs are QASYMM8), S16, QSYMM16 (only if both inputs are QSYMM16), F16/F32.
+     * @param[in] input1   First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
+     * @param[in] input2   Second tensor input info. Data types supported: same as @p input1.
+     * @param[in] output   Output tensor info. Data types supported: same as @p input1.
      * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
      *
      * @return a status
      */
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 
 /** Basic function to run @ref CLArithmeticOperationKernel for squared difference
@@ -247,40 +767,59 @@
  * @note The tensor data type for the inputs must be QASYMM8/U8/S16/QSYMM16/F16/F32.
  * @note The function performs a squared different operation between two tensors (i.e., out[i] = (in1[i] - in2[i])^2
  */
-class CLElementwiseSquaredDiff : public ICLSimpleFunction
+class CLElementwiseSquaredDiff : public IFunction
 {
 public:
+    /** Default Constructor */
+    CLElementwiseSquaredDiff();
+    /** Default Destructor */
+    ~CLElementwiseSquaredDiff();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLElementwiseSquaredDiff(const CLElementwiseSquaredDiff &) = delete;
+    /** Default move constructor */
+    CLElementwiseSquaredDiff(CLElementwiseSquaredDiff &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLElementwiseSquaredDiff &operator=(const CLElementwiseSquaredDiff &) = delete;
+    /** Default move assignment operator */
+    CLElementwiseSquaredDiff &operator=(CLElementwiseSquaredDiff &&);
     /** Initialise the kernel's inputs, output and conversion policy.
      *
-     * @param[in, out] input1   First tensor input. Data types supported: U8/QASYMM8/S16/QSYMM16/F16/F32.
+     * @param[in, out] input1   First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
      *                          The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
-     * @param[in, out] input2   Second tensor input. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, QSYMM16 (only if @p input1 is QSYMM16), F16/F32.
+     * @param[in, out] input2   Second tensor input. Data types supported: same as @p input1.
      *                          The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
-     * @param[out]     output   Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16, QSYMM16 (only if both inputs are QSYMM16), F16/F32.
+     * @param[out]     output   Output tensor. Data types supported: same as @p input1.
      * @param[in]      act_info (Optional) Activation layer information in case of a fused activation.
      */
     void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Initialise the kernel's inputs, output and conversion policy.
      *
      * @param[in]      compile_context The compile context to be used.
-     * @param[in, out] input1          First tensor input. Data types supported: U8/QASYMM8/S16/QSYMM16/F16/F32.
+     * @param[in, out] input1          First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
      *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
-     * @param[in, out] input2          Second tensor input. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, QSYMM16 (only if @p input1 is QSYMM16), F16/F32.
+     * @param[in, out] input2          Second tensor input. Data types supported: same as @p input1.
      *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
-     * @param[out]     output          Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16, QSYMM16 (only if both inputs are QSYMM16), F16/F32.
+     * @param[out]     output          Output tensor. Data types supported: same as @p input1.
      * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation.
      */
     void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel for squared difference
      *
-     * @param[in] input1   First tensor input info. Data types supported: U8/QASYMM8/S16/QSYMM16/F16/F32.
-     * @param[in] input2   Second tensor input info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, QSYMM16 (only if @p input1 is QSYMM16), F16/F32.
-     * @param[in] output   Output tensor info. Data types supported: U8 (Only if both inputs are U8), QASYMM8 ( only if both inputs are QASYMM8), S16, QSYMM16 (only if both inputs are QSYMM16), F16/F32.
+     * @param[in] input1   First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     * @param[in] input2   Second tensor input info. Data types supported: same as @p input1.
+     * @param[in] output   Output tensor info. Data types supported: same as @p input1.
      * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
      *
      * @return a status
      */
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 
 /** Basic function to run @ref CLArithmeticOperationKernel for power
@@ -288,9 +827,21 @@
  * @note The tensor data type for the inputs must be F16/F32.
  * @note The function performs an elementwise power of in1 to in2 (i.e., out[i] = in1[i] ^ in2[i])
  */
-class CLElementwisePower : public ICLSimpleFunction
+class CLElementwisePower : public IFunction
 {
 public:
+    /** Default Constructor */
+    CLElementwisePower();
+    /** Default Destructor */
+    ~CLElementwisePower();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLElementwisePower(const CLElementwisePower &) = delete;
+    /** Default move constructor */
+    CLElementwisePower(CLElementwisePower &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLElementwisePower &operator=(const CLElementwisePower &) = delete;
+    /** Default move assignment operator */
+    CLElementwisePower &operator=(CLElementwisePower &&);
     /** Initialise the kernel's inputs, output and conversion policy.
      *
      * @param[in, out] input1   First tensor input. Data types supported: F16/F32.
@@ -322,6 +873,13 @@
      * @return a status
      */
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CLELEMENTWISEOPERATIONS_H */
diff --git a/arm_compute/runtime/CL/functions/CLEqualizeHistogram.h b/arm_compute/runtime/CL/functions/CLEqualizeHistogram.h
index d907cfb..883f330 100644
--- a/arm_compute/runtime/CL/functions/CLEqualizeHistogram.h
+++ b/arm_compute/runtime/CL/functions/CLEqualizeHistogram.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLErode.h b/arm_compute/runtime/CL/functions/CLErode.h
index 57f701c..f8f1c72 100644
--- a/arm_compute/runtime/CL/functions/CLErode.h
+++ b/arm_compute/runtime/CL/functions/CLErode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLFFT1D.h b/arm_compute/runtime/CL/functions/CLFFT1D.h
index da15322..a6a35ab 100644
--- a/arm_compute/runtime/CL/functions/CLFFT1D.h
+++ b/arm_compute/runtime/CL/functions/CLFFT1D.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLFFT2D.h b/arm_compute/runtime/CL/functions/CLFFT2D.h
index a113f20..9ceebea 100644
--- a/arm_compute/runtime/CL/functions/CLFFT2D.h
+++ b/arm_compute/runtime/CL/functions/CLFFT2D.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h
index 7407319..53ce633 100644
--- a/arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLFastCorners.h b/arm_compute/runtime/CL/functions/CLFastCorners.h
index 1dc87d6..698cc67 100644
--- a/arm_compute/runtime/CL/functions/CLFastCorners.h
+++ b/arm_compute/runtime/CL/functions/CLFastCorners.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLFill.h b/arm_compute/runtime/CL/functions/CLFill.h
index bb12160..b79b234 100644
--- a/arm_compute/runtime/CL/functions/CLFill.h
+++ b/arm_compute/runtime/CL/functions/CLFill.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,14 +38,14 @@
 public:
     /** Initialize the function
      *
-     * @param[in,out] tensor         Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+     * @param[in,out] tensor         Source tensor. Data types supported: All.
      * @param[in]     constant_value Constant value to use to fill tensor.
      */
     void configure(ICLTensor *tensor, PixelValue constant_value);
     /** Initialize the function
      *
      * @param[in]     compile_context The compile context to be used.
-     * @param[in,out] tensor          Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+     * @param[in,out] tensor          Source tensor. Data types supported: All.
      * @param[in]     constant_value  Constant value to use to fill tensor.
      */
     void configure(const CLCompileContext &compile_context, ICLTensor *tensor, PixelValue constant_value);
diff --git a/arm_compute/runtime/CL/functions/CLFillBorder.h b/arm_compute/runtime/CL/functions/CLFillBorder.h
index 250806b..18bc20e 100644
--- a/arm_compute/runtime/CL/functions/CLFillBorder.h
+++ b/arm_compute/runtime/CL/functions/CLFillBorder.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,7 +38,7 @@
 public:
     /** Initialize the function
      *
-     * @param[in,out] tensor                Source tensor. Data types supported: U8/S16
+     * @param[in,out] tensor                Source tensor. Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32.
      * @param[in]     border_width          The border width
      * @param[in]     border_mode           Strategy to use for borders.
      * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
@@ -47,7 +47,7 @@
     /** Initialize the function
      *
      * @param[in]     compile_context       The compile context to be used.
-     * @param[in,out] tensor                Source tensor. Data types supported: U8/S16
+     * @param[in,out] tensor                Source tensor. Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32.
      * @param[in]     border_width          The border width
      * @param[in]     border_mode           Strategy to use for borders.
      * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
diff --git a/arm_compute/runtime/CL/functions/CLFlattenLayer.h b/arm_compute/runtime/CL/functions/CLFlattenLayer.h
index 98cf49a..b8139c2 100644
--- a/arm_compute/runtime/CL/functions/CLFlattenLayer.h
+++ b/arm_compute/runtime/CL/functions/CLFlattenLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLFloor.h b/arm_compute/runtime/CL/functions/CLFloor.h
index 2844a56..93c3639 100644
--- a/arm_compute/runtime/CL/functions/CLFloor.h
+++ b/arm_compute/runtime/CL/functions/CLFloor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
index 188117f..2978874 100644
--- a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
+++ b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h b/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h
index 9057440..de6d561 100644
--- a/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h
+++ b/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLGEMM.h b/arm_compute/runtime/CL/functions/CLGEMM.h
index f558811..8e4d390 100644
--- a/arm_compute/runtime/CL/functions/CLGEMM.h
+++ b/arm_compute/runtime/CL/functions/CLGEMM.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -74,7 +74,7 @@
 
     /** Configures the @ref CLGEMMReshapeRHSMatrixKernel kernel
      *
-     * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+     * @param[in] input Input tensor. Data types supported: All
      * @param[in] info  RHS matrix information to be used for reshaping.
      */
     void configure(const ICLTensor *input, GEMMRHSMatrixInfo info)
@@ -85,7 +85,7 @@
     /** Configures the @ref CLGEMMReshapeRHSMatrixKernel kernel
      *
      * @param[in] compile_context The compile context to be used.
-     * @param[in] input           Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+     * @param[in] input           Input tensor. Data types supported: All
      * @param[in] info            RHS matrix information to be used for reshaping.
      */
     void configure(const CLCompileContext &compile_context, const ICLTensor *input, GEMMRHSMatrixInfo info)
diff --git a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
index 6d1181e..277b27f 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -98,8 +98,8 @@
 public:
     /** Configures the @ref CLConvolutionLayerReshapeWeights function
      *
-     * @param[in] input      Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in] biases     Biases tensor. Data type supported: Same as @p input.
+     * @param[in] input      Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/F16/F32.
+     * @param[in] biases     Biases tensor. Data type supported: same as @p input, S32 if @p input is quantized.
      * @param[in] num_groups Number of groups when performing a grouped convolution.
      */
     void configure(const ICLTensor *input, const ICLTensor *biases, unsigned int num_groups)
@@ -109,8 +109,8 @@
     /** Configures the @ref CLConvolutionLayerReshapeWeights function
      *
      * @param[in] compile_context The compile context to be used.
-     * @param[in] input           Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in] biases          Biases tensor. Data type supported: Same as @p input.
+     * @param[in] input           Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/F16/F32.
+     * @param[in] biases          Biases tensor. Data type supported: same as @p input, S32 if @p input is quantized.
      * @param[in] num_groups      Number of groups when performing a grouped convolution.
      */
     void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *biases, unsigned int num_groups)
@@ -183,11 +183,11 @@
      *
      * @param[in]  input        Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
      *                          while every optional dimension from 4 and above represent a batch of inputs.
-     *                          Data types supported: QASYMM8/F16/F32.
+     *                          Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[in]  weights      Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
-     *                          Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
+     *                          Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8 or QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8_SIGNED.
      * @param[in]  biases       Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
-     *                          Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
+     *                          Data type supported: Should match @p input data type, except for input of quantized type where biases should be of S32 type.
      * @param[out] output       Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
      *                          Data types supported: Same as @p input.
      * @param[in]  conv_info    Contains padding and stride information described in @ref PadStrideInfo.
@@ -204,11 +204,11 @@
      * @param[in]  compile_context The compile context to be used.
      * @param[in]  input           Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
      *                             while every optional dimension from 4 and above represent a batch of inputs.
-     *                             Data types supported: QASYMM8/F16/F32.
+     *                             Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[in]  weights         Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
-     *                             Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
+     *                             Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8 or QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8_SIGNED.
      * @param[in]  biases          Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
-     *                             Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
+     *                             Data type supported: Should match @p input data type, except for input of quantized type where biases should be of S32 type.
      * @param[out] output          Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
      *                             Data types supported: Same as @p input.
      * @param[in]  conv_info       Contains padding and stride information described in @ref PadStrideInfo.
@@ -225,11 +225,11 @@
      *
      * @param[in]  input        Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
      *                          while every optional dimension from 4 and above represent a batch of inputs.
-     *                          Data types supported: QASYMM8/F16/F32.
+     *                          Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[in]  weights      Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
-     *                          Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
+     *                          Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8 or QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8_SIGNED.
      * @param[in]  biases       Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
-     *                          Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
+     *                          Data type supported: Should match @p input data type, except for input of quantized type where biases should be of S32 type.
      * @param[out] output       Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
      *                          Data types supported: Same as @p input.
      * @param[in]  conv_info    Contains padding and stride information described in @ref PadStrideInfo.
@@ -252,12 +252,12 @@
     /** Configures the appropriate matrix multiply routine
      *
      * @param[in]      compile_context       The compile context to be used.
-     * @param[in]      input                 Input tensor. Data types supported: QASYMM8/F16/F32.
-     * @param[in]      weights               Weights tensor. Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
+     * @param[in]      input                 Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]      weights               Weights tensor. Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8 or
+     *                                       QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8_SIGNED.
      * @param[in]      biases                Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
-     *                                       Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
-     * @param[in, out] output                Output tensor. Data types supported: Same as @p input,
-     *                                       except for input of QASYMM8 type where output should be of S32 type.
+     *                                       Data type supported: Should match @p input data type, except for input of quantized type where biases should be of S32 type.
+     * @param[in, out] output                Output tensor. Data types supported: same as @p input.
      * @param[in]      gemmlowp_output_stage GEMMLowp output stage info
      * @param[in]      gemm_3d_depth         Depth of GEMM 3D
      * @param[in]      act_info              Activation to apply after the matrix multiplication
@@ -267,12 +267,12 @@
                       int gemm_3d_depth, const ActivationLayerInfo &act_info);
     /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMConvolutionLayer matrix multiply routines
      *
-     * @param[in] input                 Input tensor info. Data types supported: QASYMM8/F16/F32.
-     * @param[in] weights               Weights tensor info. Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
+     * @param[in] input                 Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] weights               Weights tensor info. Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8 or
+     *                                  QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8_SIGNED.
      * @param[in] biases                Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
-     *                                  Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
-     * @param[in] output                Output tensor info. Data types supported: Same as @p input,
-     *                                  except for input of QASYMM8 type where output should be of S32 type.
+     *                                  Data type supported: Should match @p input data type, except for input of quantized type where biases should be of S32 type.
+     * @param[in] output                Output tensor info. Data types supported: same as @p input.
      * @param[in] gemmlowp_output_stage GEMMLowp output stage info
      * @param[in] gemm_3d_depth         Depth of GEMM 3D
      * @param[in] skip_im2col           Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout.
diff --git a/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h
index d8710a4..1fedeff 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
index 6ac3cef..57b1e30 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -59,8 +59,8 @@
      * @note GEMMLowp:  low precision GEMM kernel. [A * B + C]
      *  This kernel performs the following computations:
      *
-     *  -# Convert a values from QASYMM8 to int32 and add a_offset to each of them.
-     *  -# Convert b values from QASYMM8 to int32 and add b_offset to each of them.
+     *  -# Convert a values from 8-bit quantized to int32 and add a_offset to each of them.
+     *  -# Convert b values from 8-bit quantized to int32 and add b_offset to each of them.
      *  -# Compute the matrix product of the resulting a * b in int32.
      *  -# Quantize to uint8 if gemm_info.gemmlowp_output_stage != NONE
      *
@@ -77,8 +77,8 @@
      * @note GEMMLowp:  low precision GEMM kernel. [A * B + C]
      *  This kernel performs the following computations:
      *
-     *  -# Convert a values from QASYMM8 to int32 and add a_offset to each of them.
-     *  -# Convert b values from QASYMM8 to int32 and add b_offset to each of them.
+     *  -# Convert a values from 8-bit quantized to int32 and add a_offset to each of them.
+     *  -# Convert b values from 8-bit quantized to int32 and add b_offset to each of them.
      *  -# Compute the matrix product of the resulting a * b in int32.
      *  -# Quantize to uint8 if gemm_info.gemmlowp_output_stage != NONE
      *
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
index 06cb759..c6e9588 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,76 +38,6 @@
 {
 class ITensor;
 
-/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToUint8Scale on OpenCL.
- *
- *  CLGEMMLowpQuantizeDownInt32ToUint8Scale depends on 3 parameters: result_offset, result_mult_int, result_shift
- *  The final result is:
- *
- *  ((input[i][k] + result_offset) * result_mult_int) >> result_shift
- *
- * In case the bias tensor is provided, the final result is:
- *
- *  ((input[i][k] + bias[k] + result_offset) * result_mult_int) >> result_shift
- *
- *  This function calls the following OpenCL kernels:
- *
- * -# @ref CLGEMMLowpQuantizeDownInt32ScaleKernel
- *
- * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- *       after the result is shifted right by result_shift
-*/
-class CLGEMMLowpQuantizeDownInt32ToUint8Scale : public ICLSimpleFunction
-{
-public:
-    /** Initialise the kernel's inputs, output
-     *
-     * @param[in]  input           Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
-     * @param[in]  bias            Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                             Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output          Output tensor. Data type supported: QASYMM8
-     * @param[in]  result_offset   Offset to be added to each element of the input matrix
-     * @param[in]  result_mult_int Value to be multiplied to each element of the input matrix when once the result_offset has been add
-     * @param[in]  result_shift    Number of bits to shift right the result before converting back to QASYMM8
-     * @param[in]  min             (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
-     * @param[in]  max             (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
-     *                             Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
-     */
-    ARM_COMPUTE_DEPRECATED_REL(20.05)
-    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min = std::numeric_limits<int32_t>::lowest(),
-                   int max = std::numeric_limits<int32_t>::max());
-    /** Initialise the kernel's inputs, output
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
-     * @param[in]  bias            Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                             Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output          Output tensor. Data type supported: QASYMM8
-     * @param[in]  result_offset   Offset to be added to each element of the input matrix
-     * @param[in]  result_mult_int Value to be multiplied to each element of the input matrix when once the result_offset has been add
-     * @param[in]  result_shift    Number of bits to shift right the result before converting back to QASYMM8
-     * @param[in]  min             (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
-     * @param[in]  max             (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
-     *                             Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
-     */
-    ARM_COMPUTE_DEPRECATED_REL(20.05)
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift,
-                   int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8Scale
-     *
-     * @param[in] input  Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
-     * @param[in] bias   Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                   Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] output Output tensor. Data type supported: QASYMM8
-     * @param[in] min    (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
-     * @param[in] max    (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
-     *                   Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
-     *
-     * @return a status
-     */
-    ARM_COMPUTE_DEPRECATED_REL(20.05)
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
-
 /** Basic function to execute CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint on OpenCL.
  *
  *  CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint depends on 3 parameters:
@@ -264,65 +194,6 @@
     static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
 };
 
-/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat on OpenCL.
- *
- *  This function calls the following OpenCL kernels:
- *
- * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
- *
- * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- *       after the result is shifted right by result_shift
-*/
-class CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat : public ICLSimpleFunction
-{
-public:
-    /** Initialise the kernel's inputs, output
-     *
-     * @param[in]  input      Input tensor. Data type supported: S32
-     * @param[in]  bias       Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                        Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output     Output tensor. Data type supported: QASYMM8
-     * @param[in]  multiplier Float multiplier to be multiplied to each element of the input matrix
-     * @param[in]  offset     Offset to be applied to result before converting it back to QASYMM8
-     * @param[in]  min        (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
-     * @param[in]  max        (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
-     *                        Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
-     */
-    ARM_COMPUTE_DEPRECATED_REL(20.05)
-    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, float multiplier, int offset, int min = std::numeric_limits<int32_t>::lowest(),
-                   int max = std::numeric_limits<int32_t>::max());
-    /** Initialise the kernel's inputs, output
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Input tensor. Data type supported: S32
-     * @param[in]  bias            Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                             Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output          Output tensor. Data type supported: QASYMM8
-     * @param[in]  multiplier      Float multiplier to be multiplied to each element of the input matrix
-     * @param[in]  offset          Offset to be applied to result before converting it back to QASYMM8
-     * @param[in]  min             (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
-     * @param[in]  max             (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
-     *                        Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
-     */
-    ARM_COMPUTE_DEPRECATED_REL(20.05)
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, float multiplier, int offset,
-                   int min = std::numeric_limits<int32_t>::lowest(),
-                   int max = std::numeric_limits<int32_t>::max());
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
-     *
-     * @param[in] input  Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
-     * @param[in] bias   Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                   Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] output Output tensor. Data type supported: QASYMM8
-     * @param[in] min    (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
-     * @param[in] max    (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
-     *                   Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
-     *
-     * @return a status
-     */
-    ARM_COMPUTE_DEPRECATED_REL(20.05)
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
 /** Basic function to execute CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint on OpenCL.
  *
  *  CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint depends on 2 parameters:
@@ -442,4 +313,4 @@
     static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info);
 };
 } // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H */
\ No newline at end of file
+#endif /*ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H */
diff --git a/arm_compute/runtime/CL/functions/CLGather.h b/arm_compute/runtime/CL/functions/CLGather.h
index dcd9efc..e87a120 100644
--- a/arm_compute/runtime/CL/functions/CLGather.h
+++ b/arm_compute/runtime/CL/functions/CLGather.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLGaussian3x3.h b/arm_compute/runtime/CL/functions/CLGaussian3x3.h
index f1906cd..9fe3e9b 100644
--- a/arm_compute/runtime/CL/functions/CLGaussian3x3.h
+++ b/arm_compute/runtime/CL/functions/CLGaussian3x3.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLGaussian5x5.h b/arm_compute/runtime/CL/functions/CLGaussian5x5.h
index d4ed772..fb369d7 100644
--- a/arm_compute/runtime/CL/functions/CLGaussian5x5.h
+++ b/arm_compute/runtime/CL/functions/CLGaussian5x5.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLGaussianPyramid.h b/arm_compute/runtime/CL/functions/CLGaussianPyramid.h
index a75a4d1..70f324b 100644
--- a/arm_compute/runtime/CL/functions/CLGaussianPyramid.h
+++ b/arm_compute/runtime/CL/functions/CLGaussianPyramid.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h b/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h
index 91b30fa..6d5f2e5 100644
--- a/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h
+++ b/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,10 +29,10 @@
 #include "arm_compute/core/CL/kernels/CLPadLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLPermuteKernel.h"
 #include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
 #include "arm_compute/runtime/CPP/CPPScheduler.h"
 #include "arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h"
 #include "arm_compute/runtime/IFunction.h"
@@ -131,9 +131,9 @@
 
     // OpenCL kernels
     CLPermuteKernel              _permute_deltas_kernel;
-    CLReshapeLayerKernel         _flatten_deltas_kernel;
+    CLReshapeLayer               _flatten_deltas;
     CLPermuteKernel              _permute_scores_kernel;
-    CLReshapeLayerKernel         _flatten_scores_kernel;
+    CLReshapeLayer               _flatten_scores;
     CLComputeAllAnchorsKernel    _compute_anchors_kernel;
     CLBoundingBoxTransformKernel _bounding_box_kernel;
     CLPadLayerKernel             _pad_kernel;
diff --git a/arm_compute/runtime/CL/functions/CLHOGDescriptor.h b/arm_compute/runtime/CL/functions/CLHOGDescriptor.h
index 71280c8..dad7e6e 100644
--- a/arm_compute/runtime/CL/functions/CLHOGDescriptor.h
+++ b/arm_compute/runtime/CL/functions/CLHOGDescriptor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLHOGDetector.h b/arm_compute/runtime/CL/functions/CLHOGDetector.h
index c2bdc15..6697b5c 100644
--- a/arm_compute/runtime/CL/functions/CLHOGDetector.h
+++ b/arm_compute/runtime/CL/functions/CLHOGDetector.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLHOGGradient.h b/arm_compute/runtime/CL/functions/CLHOGGradient.h
index 450a4a6..b058902 100644
--- a/arm_compute/runtime/CL/functions/CLHOGGradient.h
+++ b/arm_compute/runtime/CL/functions/CLHOGGradient.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h b/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h
index 3d22ff6..e7631c2 100644
--- a/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h
+++ b/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLHarrisCorners.h b/arm_compute/runtime/CL/functions/CLHarrisCorners.h
index 2d0e78b..90d8c88 100644
--- a/arm_compute/runtime/CL/functions/CLHarrisCorners.h
+++ b/arm_compute/runtime/CL/functions/CLHarrisCorners.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLHistogram.h b/arm_compute/runtime/CL/functions/CLHistogram.h
index 6d34dd7..7fdb8a9 100644
--- a/arm_compute/runtime/CL/functions/CLHistogram.h
+++ b/arm_compute/runtime/CL/functions/CLHistogram.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h
index 4614b90..d7aa11c 100644
--- a/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLIntegralImage.h b/arm_compute/runtime/CL/functions/CLIntegralImage.h
index 1ea189b..6b10ede 100644
--- a/arm_compute/runtime/CL/functions/CLIntegralImage.h
+++ b/arm_compute/runtime/CL/functions/CLIntegralImage.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h b/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h
index 91c547b..bc79101 100644
--- a/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h
+++ b/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayer.h b/arm_compute/runtime/CL/functions/CLLSTMLayer.h
index a29513a..1a8b334 100644
--- a/arm_compute/runtime/CL/functions/CLLSTMLayer.h
+++ b/arm_compute/runtime/CL/functions/CLLSTMLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,19 +26,17 @@
 
 #include "arm_compute/runtime/IFunction.h"
 
-#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLCopyKernel.h"
-#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
 #include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
-#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
-#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
 #include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"
 #include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
 #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
 #include "arm_compute/runtime/CL/functions/CLGEMM.h"
 #include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/common/LSTMParams.h"
@@ -99,7 +97,7 @@
                    const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
                    const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
                    const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
-                   const ICLTensor *output_state_in, const ICLTensor *cell_state_in,
+                   const ICLTensor *output_state_in, ICLTensor *cell_state_in,
                    ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output,
                    const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f);
     /** Initialize function's tensors.
@@ -145,7 +143,7 @@
                    const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
                    const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
                    const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
-                   const ICLTensor *output_state_in, const ICLTensor *cell_state_in,
+                   const ICLTensor *output_state_in, ICLTensor *cell_state_in,
                    ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output,
                    const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f);
 
@@ -202,90 +200,90 @@
     void prepare() override;
 
 private:
-    MemoryGroup                          _memory_group;
-    CLFullyConnectedLayer                _fully_connected_input_gate;
-    CLArithmeticAddition                 _accum_input_gate1;
-    CLSaturatedArithmeticOperationKernel _subtract_input_gate;
-    CLPixelWiseMultiplicationKernel      _pixelwise_mul_input_gate;
-    CLActivationLayerKernel              _activation_input_gate;
-    CLFullyConnectedLayer                _fully_connected_forget_gate;
-    CLArithmeticAddition                 _accum_forget_gate1;
-    CLPixelWiseMultiplicationKernel      _pixelwise_mul_forget_gate;
-    CLActivationLayerKernel              _activation_forget_gate;
-    CLFullyConnectedLayer                _fully_connected_cell_state;
-    CLGEMM                               _gemm_cell_state1;
-    CLTransposeKernel                    _transpose_cell_state;
-    CLSaturatedArithmeticOperationKernel _accum_cell_state1;
-    CLSaturatedArithmeticOperationKernel _accum_cell_state2;
-    CLPixelWiseMultiplicationKernel      _pixelwise_mul_cell_state1;
-    CLActivationLayerKernel              _activation_cell_state;
-    CLActivationLayerKernel              _cell_clip;
-    CLPixelWiseMultiplicationKernel      _pixelwise_mul_cell_state2;
-    CLFullyConnectedLayer                _fully_connected_output;
-    CLPixelWiseMultiplicationKernel      _pixelwise_mul_output_state1;
-    CLArithmeticAddition                 _accum_output1;
-    CLActivationLayerKernel              _activation_output;
-    CLActivationLayerKernel              _activation_output_state;
-    CLPixelWiseMultiplicationKernel      _pixelwise_mul_output_state2;
-    CLFullyConnectedLayer                _fully_connected_output_state;
-    CLActivationLayerKernel              _projection_clip;
-    CLCopyKernel                         _copy_cell_state;
-    CLCopyKernel                         _copy_output;
-    CLConcatenateLayer                   _concat_scratch_buffer;
-    CLWidthConcatenate2TensorsKernel     _concat_inputs_forget_gate;
-    CLWidthConcatenate2TensorsKernel     _concat_weights_forget_gate;
-    CLWidthConcatenate2TensorsKernel     _concat_weights_input_gate;
-    CLWidthConcatenate2TensorsKernel     _concat_weights_output;
-    CLMemsetKernel                       _ones_memset_kernel;
-    CLMeanStdDevNormalizationLayer       _mean_std_norm_input_gate;
-    CLPixelWiseMultiplicationKernel      _pixelwise_mul_input_gate_coeff;
-    CLSaturatedArithmeticOperationKernel _accum_input_gate_bias;
-    CLMeanStdDevNormalizationLayer       _mean_std_norm_forget_gate;
-    CLPixelWiseMultiplicationKernel      _pixelwise_mul_forget_gate_coeff;
-    CLSaturatedArithmeticOperationKernel _accum_forget_gate_bias;
-    CLMeanStdDevNormalizationLayer       _mean_std_norm_cell_gate;
-    CLPixelWiseMultiplicationKernel      _pixelwise_mul_cell_gate_coeff;
-    CLSaturatedArithmeticOperationKernel _accum_cell_gate_bias;
-    CLMeanStdDevNormalizationLayer       _mean_std_norm_output_gate;
-    CLPixelWiseMultiplicationKernel      _pixelwise_mul_output_gate_coeff;
-    CLSaturatedArithmeticOperationKernel _accum_output_gate_bias;
-    CLTensor                             _input_gate_out1;
-    CLTensor                             _input_gate_out2;
-    CLTensor                             _input_gate_out3;
-    CLTensor                             _input_gate_out4;
-    CLTensor                             _forget_gate_out1;
-    CLTensor                             _forget_gate_out2;
-    CLTensor                             _forget_gate_out3;
-    CLTensor                             _forget_gate_out4;
-    CLTensor                             _forget_gate_out5;
-    CLTensor                             _forget_gate_out6;
-    CLTensor                             _cell_state_out1;
-    CLTensor                             _cell_state_out2;
-    CLTensor                             _cell_state_out3;
-    CLTensor                             _cell_state_out4;
-    CLTensor                             _cell_state_out5;
-    CLTensor                             _output1;
-    CLTensor                             _output2;
-    CLTensor                             _output3;
-    CLTensor                             _output4;
-    CLTensor                             _cell_state_activation;
-    CLTensor                             _output_state1;
-    CLTensor                             _ones;
-    CLTensor                             _input_layer_norm_out1;
-    CLTensor                             _input_layer_norm_out2;
-    CLTensor                             _forget_layer_norm_out1;
-    CLTensor                             _forget_layer_norm_out2;
-    CLTensor                             _cell_layer_norm_out1;
-    CLTensor                             _cell_layer_norm_out2;
-    CLTensor                             _output_layer_norm_out1;
-    CLTensor                             _output_layer_norm_out2;
-    bool                                 _run_peephole_opt;
-    bool                                 _run_cifg_opt;
-    bool                                 _perform_cell_clipping;
-    bool                                 _has_projection_weights;
-    bool                                 _perform_projection_clipping;
-    bool                                 _is_prepared;
-    bool                                 _is_layer_norm_lstm;
+    MemoryGroup                    _memory_group;
+    CLFullyConnectedLayer          _fully_connected_input_gate;
+    CLArithmeticAddition           _accum_input_gate1;
+    CLArithmeticSubtraction        _subtract_input_gate;
+    CLPixelWiseMultiplication      _pixelwise_mul_input_gate;
+    CLActivationLayer              _activation_input_gate;
+    CLFullyConnectedLayer          _fully_connected_forget_gate;
+    CLArithmeticAddition           _accum_forget_gate1;
+    CLPixelWiseMultiplication      _pixelwise_mul_forget_gate;
+    CLActivationLayer              _activation_forget_gate;
+    CLFullyConnectedLayer          _fully_connected_cell_state;
+    CLGEMM                         _gemm_cell_state1;
+    CLTransposeKernel              _transpose_cell_state;
+    CLArithmeticAddition           _accum_cell_state1;
+    CLArithmeticAddition           _accum_cell_state2;
+    CLPixelWiseMultiplication      _pixelwise_mul_cell_state1;
+    CLActivationLayer              _activation_cell_state;
+    CLActivationLayer              _cell_clip;
+    CLPixelWiseMultiplication      _pixelwise_mul_cell_state2;
+    CLFullyConnectedLayer          _fully_connected_output;
+    CLPixelWiseMultiplication      _pixelwise_mul_output_state1;
+    CLArithmeticAddition           _accum_output1;
+    CLActivationLayer              _activation_output;
+    CLActivationLayer              _activation_output_state;
+    CLPixelWiseMultiplication      _pixelwise_mul_output_state2;
+    CLFullyConnectedLayer          _fully_connected_output_state;
+    CLActivationLayer              _projection_clip;
+    CLCopyKernel                   _copy_cell_state;
+    CLCopyKernel                   _copy_output;
+    CLConcatenateLayer             _concat_scratch_buffer;
+    CLConcatenateLayer             _concat_inputs_forget_gate;
+    CLConcatenateLayer             _concat_weights_forget_gate;
+    CLConcatenateLayer             _concat_weights_input_gate;
+    CLConcatenateLayer             _concat_weights_output;
+    CLMemsetKernel                 _ones_memset_kernel;
+    CLMeanStdDevNormalizationLayer _mean_std_norm_input_gate;
+    CLPixelWiseMultiplication      _pixelwise_mul_input_gate_coeff;
+    CLArithmeticAddition           _accum_input_gate_bias;
+    CLMeanStdDevNormalizationLayer _mean_std_norm_forget_gate;
+    CLPixelWiseMultiplication      _pixelwise_mul_forget_gate_coeff;
+    CLArithmeticAddition           _accum_forget_gate_bias;
+    CLMeanStdDevNormalizationLayer _mean_std_norm_cell_gate;
+    CLPixelWiseMultiplication      _pixelwise_mul_cell_gate_coeff;
+    CLArithmeticAddition           _accum_cell_gate_bias;
+    CLMeanStdDevNormalizationLayer _mean_std_norm_output_gate;
+    CLPixelWiseMultiplication      _pixelwise_mul_output_gate_coeff;
+    CLArithmeticAddition           _accum_output_gate_bias;
+    CLTensor                       _input_gate_out1;
+    CLTensor                       _input_gate_out2;
+    CLTensor                       _input_gate_out3;
+    CLTensor                       _input_gate_out4;
+    CLTensor                       _forget_gate_out1;
+    CLTensor                       _forget_gate_out2;
+    CLTensor                       _forget_gate_out3;
+    CLTensor                       _forget_gate_out4;
+    CLTensor                       _forget_gate_out5;
+    CLTensor                       _forget_gate_out6;
+    CLTensor                       _cell_state_out1;
+    CLTensor                       _cell_state_out2;
+    CLTensor                       _cell_state_out3;
+    CLTensor                       _cell_state_out4;
+    CLTensor                       _cell_state_out5;
+    CLTensor                       _output1;
+    CLTensor                       _output2;
+    CLTensor                       _output3;
+    CLTensor                       _output4;
+    CLTensor                       _cell_state_activation;
+    CLTensor                       _output_state1;
+    CLTensor                       _ones;
+    CLTensor                       _input_layer_norm_out1;
+    CLTensor                       _input_layer_norm_out2;
+    CLTensor                       _forget_layer_norm_out1;
+    CLTensor                       _forget_layer_norm_out2;
+    CLTensor                       _cell_layer_norm_out1;
+    CLTensor                       _cell_layer_norm_out2;
+    CLTensor                       _output_layer_norm_out1;
+    CLTensor                       _output_layer_norm_out2;
+    bool                           _run_peephole_opt;
+    bool                           _run_cifg_opt;
+    bool                           _perform_cell_clipping;
+    bool                           _has_projection_weights;
+    bool                           _perform_projection_clipping;
+    bool                           _is_prepared;
+    bool                           _is_layer_norm_lstm;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CLLSTMLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h b/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h
index 082fdb4..0829052 100644
--- a/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h
+++ b/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h b/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h
index 49a87ba..e1a8b25 100644
--- a/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h
+++ b/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h b/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h
index 2c7afde..4ccc1a4 100644
--- a/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h
+++ b/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
index 7a43eab..59d0db6 100644
--- a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
+++ b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLMagnitude.h b/arm_compute/runtime/CL/functions/CLMagnitude.h
index e52ab24..ad7cc77 100644
--- a/arm_compute/runtime/CL/functions/CLMagnitude.h
+++ b/arm_compute/runtime/CL/functions/CLMagnitude.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h b/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h
new file mode 100644
index 0000000..5c8548f
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLMAXUNPOOLINGLAYER_H
+#define ARM_COMPUTE_CLMAXUNPOOLINGLAYER_H
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/CL/kernels/CLMaxUnpoolingLayerKernel.h"
+#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Function to perform MaxUnpooling. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLMemsetKernel
+ * -# @ref CLMaxUnpoolingLayerKernel
+ */
+class CLMaxUnpoolingLayer : public IFunction
+{
+public:
+    /** Constructor */
+    CLMaxUnpoolingLayer();
+    /** Set the input and output tensors.
+     *
+     * @note Output shape must be equal to the shape of the original input to pool.
+     *
+     * @param[in]  input     Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  indices   Tensor containing the offset to store the input elements in the output tensor.
+     *                       @ref CLPoolingLayer with indices should precede this function in order to
+     *                       properly reconstruct the output tensor.
+     *                       The tensor shape of this tensor has to be equal to the input tensor shape. Data type supported: U32.
+     * @param[out] output    Destination tensor. Data types supported: Same as @p input.
+     * @param[in]  pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     */
+    void configure(ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info);
+    /** Set the input and output tensors.
+     *
+     * @note Output shape must be equal to the shape of the original input to pool.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  indices         Tensor containing the offset to store the input elements in the output tensor.
+     *                             @ref CLPoolingLayer with indices should precede this function in order to
+     *                             properly reconstruct the output tensor.
+     *                             The tensor shape of this tensor has to be equal to the input tensor shape. Data type supported: U32.
+     * @param[out] output          Destination tensor. Data types supported: Same as @p input.
+     * @param[in]  pool_info       Contains pooling operation information described in @ref PoolingLayerInfo.
+     */
+    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLMaxUnpoolingLayer
+     *
+     * @param[in] input     Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] output    Destination tensor info. Data types supported: Same as @p input.
+     * @param[in] indices   TensorInfo associated to the tensor containing the offset to store the input elements in the output tensor.
+     *                      @ref CLPoolingLayer with indices should precede this function in order to
+     *                      properly reconstruct the output tensor.
+     *                      The tensor shape of this tensor has to be equal to the input tensor shape. Data type supported: U32.
+     * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    CLMemsetKernel            _memset_kernel;
+    CLMaxUnpoolingLayerKernel _unpooling_layer_kernel;
+};
+}
+#endif /* ARM_COMPUTE_CLMAXUNPOOLINGLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLMeanStdDev.h b/arm_compute/runtime/CL/functions/CLMeanStdDev.h
index 561ac04..be192a7 100644
--- a/arm_compute/runtime/CL/functions/CLMeanStdDev.h
+++ b/arm_compute/runtime/CL/functions/CLMeanStdDev.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h
index e39a590..1627de1 100644
--- a/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLMedian3x3.h b/arm_compute/runtime/CL/functions/CLMedian3x3.h
index f3bb283..7f67f95 100644
--- a/arm_compute/runtime/CL/functions/CLMedian3x3.h
+++ b/arm_compute/runtime/CL/functions/CLMedian3x3.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLMinMaxLocation.h b/arm_compute/runtime/CL/functions/CLMinMaxLocation.h
index e9e3bd9..04926f7 100644
--- a/arm_compute/runtime/CL/functions/CLMinMaxLocation.h
+++ b/arm_compute/runtime/CL/functions/CLMinMaxLocation.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLNonLinearFilter.h b/arm_compute/runtime/CL/functions/CLNonLinearFilter.h
index 79f73ea..8b7e350 100644
--- a/arm_compute/runtime/CL/functions/CLNonLinearFilter.h
+++ b/arm_compute/runtime/CL/functions/CLNonLinearFilter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h b/arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h
index e2c0c4f..556de1c 100644
--- a/arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h
+++ b/arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h
index 07bb62c..a2d46b3 100644
--- a/arm_compute/runtime/CL/functions/CLNormalizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h b/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h
index 5dd3760..cf4a9b6 100644
--- a/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h
+++ b/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLOpticalFlow.h b/arm_compute/runtime/CL/functions/CLOpticalFlow.h
index 12d0583..adce674 100644
--- a/arm_compute/runtime/CL/functions/CLOpticalFlow.h
+++ b/arm_compute/runtime/CL/functions/CLOpticalFlow.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLPReluLayer.h b/arm_compute/runtime/CL/functions/CLPReluLayer.h
index 74fa86a..8474350 100644
--- a/arm_compute/runtime/CL/functions/CLPReluLayer.h
+++ b/arm_compute/runtime/CL/functions/CLPReluLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,20 +24,72 @@
 #ifndef ARM_COMPUTE_CLPRELULAYER_H
 #define ARM_COMPUTE_CLPRELULAYER_H
 
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/runtime/CL/ICLOperator.h"
+#include "arm_compute/runtime/IFunction.h"
 
 namespace arm_compute
 {
 class ICLTensor;
 
+namespace experimental
+{
 /** Basic function to run @ref CLArithmeticOperationKernel for PRELU
  *
  * @note The function implements an activation layer with the PRELU activation function.
  */
-class CLPReluLayer : public ICLSimpleFunction
+class CLPReluLayer : public ICLOperator
 {
 public:
+    /** Default Constructor */
+    CLPReluLayer();
+    /** Set the input and output tensor.
+     *
+     * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  alpha           PRelu layer parameters. Data types supported: same of @p input.
+     * @param[out] output          Destination tensor. Data type supported: same as @p input
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLPReluLayer
+     *
+     * @param[in] input  Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] alpha  PRelu layer parameters. Data types supported: same of @p input.
+     * @param[in] output Destination tensor info. Data type supported: same as @p input
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+
+private:
+    CLFillBorderKernel _border_handler;
+};
+} // namespace experimental
+
+/** Basic function to run @ref CLArithmeticOperationKernel for PRELU
+ *
+ * @note The function implements an activation layer with the PRELU activation function.
+ */
+class CLPReluLayer : public IFunction
+{
+public:
+    /** Default Constructor */
+    CLPReluLayer();
+    /** Default Destructor */
+    ~CLPReluLayer();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLPReluLayer(const CLPReluLayer &) = delete;
+    /** Default move constructor */
+    CLPReluLayer(CLPReluLayer &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLPReluLayer &operator=(const CLPReluLayer &) = delete;
+    /** Default move assignment operator */
+    CLPReluLayer &operator=(CLPReluLayer &&);
     /** Set the input and output tensor.
      *
      * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place
@@ -66,6 +118,13 @@
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CLPRELULAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLPadLayer.h b/arm_compute/runtime/CL/functions/CLPadLayer.h
index 82d72053..e3a923f 100644
--- a/arm_compute/runtime/CL/functions/CLPadLayer.h
+++ b/arm_compute/runtime/CL/functions/CLPadLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLPermute.h b/arm_compute/runtime/CL/functions/CLPermute.h
index 37e651c..abc23ef 100644
--- a/arm_compute/runtime/CL/functions/CLPermute.h
+++ b/arm_compute/runtime/CL/functions/CLPermute.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLPhase.h b/arm_compute/runtime/CL/functions/CLPhase.h
index f993906..2731a08 100644
--- a/arm_compute/runtime/CL/functions/CLPhase.h
+++ b/arm_compute/runtime/CL/functions/CLPhase.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h
index 8b0ee70..2066012 100644
--- a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h
+++ b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,70 +24,73 @@
 #ifndef ARM_COMPUTE_CLPIXELWISEMULTIPLICATION_H
 #define ARM_COMPUTE_CLPIXELWISEMULTIPLICATION_H
 
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+#include "arm_compute/runtime/CL/ICLOperator.h"
+#include "arm_compute/runtime/IFunction.h"
 
 namespace arm_compute
 {
 // Forward declaration
 class ICLTensor;
 
+namespace experimental
+{
 /** Basic function to run @ref CLPixelWiseMultiplicationKernel. */
-class CLPixelWiseMultiplication : public ICLSimpleFunction
+class CLPixelWiseMultiplication : public ICLOperator
 {
 public:
+    /** Default Constructor */
+    CLPixelWiseMultiplication();
     /** Initialise the kernel's inputs, output and convertion policy.
      *
-     * @param[in, out] input1          An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
-     *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
-     * @param[in, out] input2          An input tensor. Data types supported: same as @p input1.
-     *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
-     * @param[out]     output          The output tensor. Data types supported:
-     *                                 - U8, only if both input are U8
-     *                                 - QASYMM8, only if both inputs are QASYMM8
-     *                                 - QASYMM8_SIGNED, only if both inputs are QASYMM8_SIGNED
-     *                                 - S16
-     *                                 - QSYMM16, only if both inputs are QSYMM16
-     *                                 - S32, only if both inputs are QSYMM16
-     *                                 - F16
-     *                                 - F32
-     * @param[in]      scale           Scale to apply after multiplication.
-     *                                 Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
-     * @param[in]      overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
-     * @param[in]      rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
-     * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation.
-     */
-    void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
-                   ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-    /** Initialise the kernel's inputs, output and convertion policy.
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)                         -> U8
+     *   - (U8,U8)                         -> S16
+     *   - (U8,S16)                        -> S16
+     *   - (S16,U8)                        -> S16
+     *   - (S16,S16)                       -> S16
+     *   - (F16,F16)                       -> F16
+     *   - (F32,F32)                       -> F32
+     *   - (QASYMM8,QASYMM8)               -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16)               -> QSYMM16
+     *   - (QSYMM16,QSYMM16)               -> S32
      *
      * @param[in]      compile_context The compile context to be used.
      * @param[in, out] input1          An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
      *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
-     * @param[in, out] input2          An input tensor. Data types supported: same as @p input1.
+     * @param[in, out] input2          An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
      *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
-     * @param[out]     output          The output tensor, Data types supported: same as @p input1. Note: U8 requires both inputs to be U8.
+     * @param[out]     output          The output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
      * @param[in]      scale           Scale to apply after multiplication.
      *                                 Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
      * @param[in]      overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
      * @param[in]      rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
      * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale,
                    ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref CLPixelWiseMultiplication
      *
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)                         -> U8
+     *   - (U8,U8)                         -> S16
+     *   - (U8,S16)                        -> S16
+     *   - (S16,U8)                        -> S16
+     *   - (S16,S16)                       -> S16
+     *   - (F16,F16)                       -> F16
+     *   - (F32,F32)                       -> F32
+     *   - (QASYMM8,QASYMM8)               -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16)               -> QSYMM16
+     *   - (QSYMM16,QSYMM16)               -> S32
+     *
+     *
      * @param[in] input1          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
-     * @param[in] input2          An input tensor info. Data types supported: same as @p input1.
-     * @param[in] output          The output tensor info. Data types supported:
-     *                            - U8, only if both input are U8
-     *                            - QASYMM8, only if both inputs are QASYMM8
-     *                            - QASYMM8_SIGNED, only if both inputs are QASYMM8_SIGNED
-     *                            - S16
-     *                            - QSYMM16, only if both inputs are QSYMM16
-     *                            - S32, only if both inputs are QSYMM16
-     *                            - F16
-     *                            - F32
+     * @param[in] input2          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     * @param[in] output          The output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
      * @param[in] scale           Scale to apply after multiplication.
      *                            Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
      * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
@@ -98,12 +101,178 @@
      */
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
                            ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+
+private:
+    CLFillBorderKernel _border_handler;
 };
 
 /** Basic function to run @ref CLComplexPixelWiseMultiplicationKernel. */
-class CLComplexPixelWiseMultiplication : public ICLSimpleFunction
+class CLComplexPixelWiseMultiplication : public ICLOperator
 {
 public:
+    /** Default Constructor */
+    CLComplexPixelWiseMultiplication();
+    /** Initialise the kernel's inputs, output.
+     *
+     * @param[in]      compile_context The compile context to be used.
+     * @param[in, out] input1          An input tensor. Data types supported: F32. Number of channels supported: 2.
+     *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[in, out] input2          An input tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+     *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[out]     output          The output tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+     * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref CLComplexPixelWiseMultiplication
+     *
+     * @param[in] input1   An input tensor info. Data types supported: F32. Number of channels supported: 2.
+     * @param[in] input2   An input tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+     * @param[in] output   The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+     * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+
+private:
+    CLFillBorderKernel _border_handler;
+};
+} // namespace experimental
+
+/** Basic function to run @ref CLPixelWiseMultiplicationKernel. */
+class CLPixelWiseMultiplication : public IFunction
+{
+public:
+    /** Default Constructor */
+    CLPixelWiseMultiplication();
+    /** Default Destructor */
+    ~CLPixelWiseMultiplication();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLPixelWiseMultiplication(const CLPixelWiseMultiplication &) = delete;
+    /** Default move constructor */
+    CLPixelWiseMultiplication(CLPixelWiseMultiplication &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLPixelWiseMultiplication &operator=(const CLPixelWiseMultiplication &) = delete;
+    /** Default move assignment operator */
+    CLPixelWiseMultiplication &operator=(CLPixelWiseMultiplication &&);
+    /** Initialise the kernel's inputs, output and conversion policy.
+     *
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)                         -> U8
+     *   - (U8,U8)                         -> S16
+     *   - (U8,S16)                        -> S16
+     *   - (S16,U8)                        -> S16
+     *   - (S16,S16)                       -> S16
+     *   - (F16,F16)                       -> F16
+     *   - (F32,F32)                       -> F32
+     *   - (QASYMM8,QASYMM8)               -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16)               -> QSYMM16
+     *   - (QSYMM16,QSYMM16)               -> S32
+     *
+     * @param[in, out] input1          An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[in, out] input2          An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[out]     output          The output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     * @param[in]      scale           Scale to apply after multiplication.
+     *                                 Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+     * @param[in]      overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+     * @param[in]      rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
+     * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
+                   ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Initialise the kernel's inputs, output and conversion policy.
+     *
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)                         -> U8
+     *   - (U8,U8)                         -> S16
+     *   - (U8,S16)                        -> S16
+     *   - (S16,U8)                        -> S16
+     *   - (S16,S16)                       -> S16
+     *   - (F16,F16)                       -> F16
+     *   - (F32,F32)                       -> F32
+     *   - (QASYMM8,QASYMM8)               -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16)               -> QSYMM16
+     *   - (QSYMM16,QSYMM16)               -> S32
+     *
+     * @param[in]      compile_context The compile context to be used.
+     * @param[in, out] input1          An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[in, out] input2          An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     *                                 The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[out]     output          The output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     * @param[in]      scale           Scale to apply after multiplication.
+     *                                 Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+     * @param[in]      overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+     * @param[in]      rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
+     * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
+                   ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref CLPixelWiseMultiplication
+     *
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)                         -> U8
+     *   - (U8,U8)                         -> S16
+     *   - (U8,S16)                        -> S16
+     *   - (S16,U8)                        -> S16
+     *   - (S16,S16)                       -> S16
+     *   - (F16,F16)                       -> F16
+     *   - (F32,F32)                       -> F32
+     *   - (QASYMM8,QASYMM8)               -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16)               -> QSYMM16
+     *   - (QSYMM16,QSYMM16)               -> S32
+     *
+     *
+     * @param[in] input1          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     * @param[in] input2          An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     * @param[in] output          The output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     * @param[in] scale           Scale to apply after multiplication.
+     *                            Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+     * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+     * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
+     * @param[in] act_info        (Optional) Activation layer information in case of a fused activation.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
+                           ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
+};
+
+/** Basic function to run @ref CLComplexPixelWiseMultiplicationKernel. */
+class CLComplexPixelWiseMultiplication : public IFunction
+{
+public:
+    /** Default Constructor */
+    CLComplexPixelWiseMultiplication();
+    /** Default Destructor */
+    ~CLComplexPixelWiseMultiplication();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLComplexPixelWiseMultiplication(const CLComplexPixelWiseMultiplication &) = delete;
+    /** Default move constructor */
+    CLComplexPixelWiseMultiplication(CLComplexPixelWiseMultiplication &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLComplexPixelWiseMultiplication &operator=(const CLComplexPixelWiseMultiplication &) = delete;
+    /** Default move assignment operator */
+    CLComplexPixelWiseMultiplication &operator=(CLComplexPixelWiseMultiplication &&);
     /** Initialise the kernel's inputs, output.
      *
      * @param[in, out] input1   An input tensor. Data types supported: F32. Number of channels supported: 2.
@@ -133,6 +302,13 @@
      * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
      */
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_CLPIXELWISEMULTIPLICATION_H */
diff --git a/arm_compute/runtime/CL/functions/CLPoolingLayer.h b/arm_compute/runtime/CL/functions/CLPoolingLayer.h
index 7d646ab..96dacf9 100644
--- a/arm_compute/runtime/CL/functions/CLPoolingLayer.h
+++ b/arm_compute/runtime/CL/functions/CLPoolingLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h b/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h
index d39e411..9a78e77 100644
--- a/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h
+++ b/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h
index 67e8bc7..53f337b 100644
--- a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h
+++ b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,14 +25,14 @@
 #define ARM_COMPUTE_CLQLSTMLAYER_H
 
 #include "arm_compute/core/CL/kernels/CLCopyKernel.h"
-#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h"
-#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
 #include "arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
 #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
 #include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
+#include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"
 #include "arm_compute/runtime/CL/functions/CLTranspose.h"
 
 #include "arm_compute/runtime/common/LSTMParams.h"
@@ -48,11 +48,11 @@
  *
  * -# @ref CLActivationLayer                                     Activation functions (tanh and logistic)
  * -# @ref CLCopyKernel                                          Copy kernel for copying output_state_out to output
- * -# @ref CLSaturatedArithmeticOperationKernel                  Elementwise addition and subtraction
+ * -# @ref CLArithmeticAddition                                  Elementwise addition and subtraction
  * -# @ref CLGEMMLowpMatrixMultiplyCore                          Quantized matrix multiplication core. Accumulators are 32-bit integers
  * -# @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint   Convert 32-bit integers into QSYMM16
  * -# @ref CLGEMMLowpMatrixAReductionKernel                      For precomputing effective biases to use
- * -# @ref CLPixelWiseMultiplicationKernel                       Elementwise multiplication
+ * -# @ref CLPixelWiseMultiplication                             Elementwise multiplication
  * -# @ref CLTranspose                                           Transpose function for reshaping the weights
  * */
 class CLQLSTMLayer : public IFunction
@@ -113,7 +113,7 @@
                    const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
                    const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
                    const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
-                   const ICLTensor *cell_state_in, const ICLTensor *output_state_in,
+                   ICLTensor *cell_state_in, const ICLTensor *output_state_in,
                    ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output,
                    const LSTMParams<ICLTensor> &lstm_params);
 
@@ -163,7 +163,7 @@
                    const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
                    const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
                    const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
-                   const ICLTensor *cell_state_in, const ICLTensor *output_state_in,
+                   ICLTensor *cell_state_in, const ICLTensor *output_state_in,
                    ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output,
                    const LSTMParams<ICLTensor> &lstm_params);
 
@@ -285,70 +285,70 @@
     };
 
     // Functions used
-    CLTranspose                          _transpose_input_to_forget_weights{};
-    CLTranspose                          _transpose_input_to_cell_weights{};
-    CLTranspose                          _transpose_input_to_output_weights{};
-    CLTranspose                          _transpose_input_to_input_weights{};
-    CLTranspose                          _transpose_recurrent_to_forget_weights{};
-    CLTranspose                          _transpose_recurrent_to_cell_weights{};
-    CLTranspose                          _transpose_recurrent_to_output_weights{};
-    CLTranspose                          _transpose_recurrent_to_input_weights{};
-    CLTranspose                          _transpose_projection_weights{};
-    CLGEMMLowpMatrixAReductionKernel     _input_to_input_reduction{};
-    CLGEMMLowpMatrixAReductionKernel     _recurrent_to_input_reduction{};
-    CLGEMMLowpMatrixAReductionKernel     _input_to_forget_reduction{};
-    CLGEMMLowpMatrixAReductionKernel     _recurrent_to_forget_reduction{};
-    CLGEMMLowpMatrixAReductionKernel     _input_to_cell_reduction{};
-    CLGEMMLowpMatrixAReductionKernel     _recurrent_to_cell_reduction{};
-    CLGEMMLowpMatrixAReductionKernel     _input_to_output_reduction{};
-    CLGEMMLowpMatrixAReductionKernel     _recurrent_to_output_reduction{};
-    CLGEMMLowpMatrixAReductionKernel     _projection_reduction{};
-    CLSaturatedArithmeticOperationKernel _projection_bias_add{};
-    CLGEMMLowpMatrixMultiplyCore         _mm_input_to_forget{};
-    CLGEMMLowpMatrixMultiplyCore         _mm_recurrent_to_forget{};
-    CLPixelWiseMultiplicationKernel      _pixelwise_mul_cell_to_forget{};
-    CLGEMMLowpOutputStage                _input_to_forget_outstage{};
-    CLGEMMLowpOutputStage                _recurrent_to_forget_outstage{};
-    CLGEMMLowpOutputStage                _cell_to_forget_outstage{};
-    CLSaturatedArithmeticOperationKernel _accumulate_input_recurrent_forget{};
-    CLSaturatedArithmeticOperationKernel _accumulate_cell_forget{};
-    CLActivationLayer                    _forget_gate_sigmoid{};
-    CLGEMMLowpMatrixMultiplyCore         _mm_input_to_cell{};
-    CLGEMMLowpOutputStage                _input_to_cell_outstage{};
-    CLGEMMLowpMatrixMultiplyCore         _mm_recurrent_to_cell{};
-    CLGEMMLowpOutputStage                _recurrent_to_cell_outstage{};
-    CLSaturatedArithmeticOperationKernel _accumulate_input_recurrent_modulation{};
-    CLActivationLayer                    _cell_gate_tanh{};
-    CLSaturatedArithmeticOperationKernel _input_gate_sub{};
-    CLGEMMLowpMatrixMultiplyCore         _mm_input_to_input{};
-    CLGEMMLowpOutputStage                _input_to_input_outstage{};
-    CLGEMMLowpMatrixMultiplyCore         _mm_recurrent_to_input{};
-    CLGEMMLowpOutputStage                _recurrent_to_input_outstage{};
-    CLSaturatedArithmeticOperationKernel _accumulate_input_recurrent_input{};
-    CLPixelWiseMultiplicationKernel      _pixelwise_mul_cell_to_input{};
-    CLGEMMLowpOutputStage                _cell_to_input_outstage{};
-    CLSaturatedArithmeticOperationKernel _accumulate_cell_input{};
-    CLActivationLayer                    _input_gate_sigmoid{};
-    CLPixelWiseMultiplicationKernel      _pixelwise_mul_forget_cell{};
-    CLPixelWiseMultiplicationKernel      _pixelwise_mul_input_cell{};
-    CLSaturatedArithmeticOperationKernel _add_forget_cell{};
-    CLActivationLayer                    _cell_clip{};
-    CLGEMMLowpMatrixMultiplyCore         _mm_input_to_output{};
-    CLGEMMLowpOutputStage                _input_to_output_outstage{};
-    CLGEMMLowpMatrixMultiplyCore         _mm_recurrent_to_output{};
-    CLGEMMLowpOutputStage                _recurrent_to_output_outstage{};
-    CLSaturatedArithmeticOperationKernel _accumulate_input_recurrent_output{};
-    CLPixelWiseMultiplicationKernel      _pixelwise_mul_cell_to_output{};
-    CLGEMMLowpOutputStage                _cell_to_output_outstage{};
-    CLSaturatedArithmeticOperationKernel _accumulate_cell_to_output{};
-    CLActivationLayer                    _output_gate_sigmoid{};
-    CLActivationLayer                    _hidden_tanh{};
-    CLPixelWiseMultiplicationKernel      _pixelwise_mul_hidden{};
-    CLGEMMLowpOutputStage                _hidden_outstage{};
-    CLGEMMLowpMatrixMultiplyCore         _mm_projection{};
-    CLGEMMLowpOutputStage                _projection_outstage{};
-    CLSaturatedArithmeticOperationKernel _accumulate_projection{};
-    CLActivationLayer                    _projection_clip{};
+    CLTranspose                      _transpose_input_to_forget_weights{};
+    CLTranspose                      _transpose_input_to_cell_weights{};
+    CLTranspose                      _transpose_input_to_output_weights{};
+    CLTranspose                      _transpose_input_to_input_weights{};
+    CLTranspose                      _transpose_recurrent_to_forget_weights{};
+    CLTranspose                      _transpose_recurrent_to_cell_weights{};
+    CLTranspose                      _transpose_recurrent_to_output_weights{};
+    CLTranspose                      _transpose_recurrent_to_input_weights{};
+    CLTranspose                      _transpose_projection_weights{};
+    CLGEMMLowpMatrixAReductionKernel _input_to_input_reduction{};
+    CLGEMMLowpMatrixAReductionKernel _recurrent_to_input_reduction{};
+    CLGEMMLowpMatrixAReductionKernel _input_to_forget_reduction{};
+    CLGEMMLowpMatrixAReductionKernel _recurrent_to_forget_reduction{};
+    CLGEMMLowpMatrixAReductionKernel _input_to_cell_reduction{};
+    CLGEMMLowpMatrixAReductionKernel _recurrent_to_cell_reduction{};
+    CLGEMMLowpMatrixAReductionKernel _input_to_output_reduction{};
+    CLGEMMLowpMatrixAReductionKernel _recurrent_to_output_reduction{};
+    CLGEMMLowpMatrixAReductionKernel _projection_reduction{};
+    CLArithmeticAddition             _projection_bias_add{};
+    CLGEMMLowpMatrixMultiplyCore     _mm_input_to_forget{};
+    CLGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_forget{};
+    CLPixelWiseMultiplication        _pixelwise_mul_cell_to_forget{};
+    CLGEMMLowpOutputStage            _input_to_forget_outstage{};
+    CLGEMMLowpOutputStage            _recurrent_to_forget_outstage{};
+    CLGEMMLowpOutputStage            _cell_to_forget_outstage{};
+    CLArithmeticAddition             _accumulate_input_recurrent_forget{};
+    CLArithmeticAddition             _accumulate_cell_forget{};
+    CLActivationLayer                _forget_gate_sigmoid{};
+    CLGEMMLowpMatrixMultiplyCore     _mm_input_to_cell{};
+    CLGEMMLowpOutputStage            _input_to_cell_outstage{};
+    CLGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_cell{};
+    CLGEMMLowpOutputStage            _recurrent_to_cell_outstage{};
+    CLArithmeticAddition             _accumulate_input_recurrent_modulation{};
+    CLActivationLayer                _cell_gate_tanh{};
+    CLArithmeticSubtraction          _input_gate_sub{};
+    CLGEMMLowpMatrixMultiplyCore     _mm_input_to_input{};
+    CLGEMMLowpOutputStage            _input_to_input_outstage{};
+    CLGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_input{};
+    CLGEMMLowpOutputStage            _recurrent_to_input_outstage{};
+    CLArithmeticAddition             _accumulate_input_recurrent_input{};
+    CLPixelWiseMultiplication        _pixelwise_mul_cell_to_input{};
+    CLGEMMLowpOutputStage            _cell_to_input_outstage{};
+    CLArithmeticAddition             _accumulate_cell_input{};
+    CLActivationLayer                _input_gate_sigmoid{};
+    CLPixelWiseMultiplication        _pixelwise_mul_forget_cell{};
+    CLPixelWiseMultiplication        _pixelwise_mul_input_cell{};
+    CLArithmeticAddition             _add_forget_cell{};
+    CLActivationLayer                _cell_clip{};
+    CLGEMMLowpMatrixMultiplyCore     _mm_input_to_output{};
+    CLGEMMLowpOutputStage            _input_to_output_outstage{};
+    CLGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_output{};
+    CLGEMMLowpOutputStage            _recurrent_to_output_outstage{};
+    CLArithmeticAddition             _accumulate_input_recurrent_output{};
+    CLPixelWiseMultiplication        _pixelwise_mul_cell_to_output{};
+    CLGEMMLowpOutputStage            _cell_to_output_outstage{};
+    CLArithmeticAddition             _accumulate_cell_to_output{};
+    CLActivationLayer                _output_gate_sigmoid{};
+    CLActivationLayer                _hidden_tanh{};
+    CLPixelWiseMultiplication        _pixelwise_mul_hidden{};
+    CLGEMMLowpOutputStage            _hidden_outstage{};
+    CLGEMMLowpMatrixMultiplyCore     _mm_projection{};
+    CLGEMMLowpOutputStage            _projection_outstage{};
+    CLArithmeticAddition             _accumulate_projection{};
+    CLActivationLayer                _projection_clip{};
     std::array<CLQLSTMLayerNormalizationKernel, _layer_norm_count> _layer_norms{ {} };
     CLCopyKernel _copy_output{};
 
@@ -358,7 +358,10 @@
     TensorCopyKernel _hidden_to_output_copy{};
 
     // Tensor pointers
-    const ICLTensor *_input_to_input_weights{ nullptr };
+    const ICLTensor *_input_to_input_weights
+    {
+        nullptr
+    };
     const ICLTensor *_recurrent_to_input_weights{ nullptr };
     const ICLTensor *_projection_bias{ nullptr };
     const ICLTensor *_input_to_forget_weights{ nullptr };
diff --git a/arm_compute/runtime/CL/functions/CLQuantizationLayer.h b/arm_compute/runtime/CL/functions/CLQuantizationLayer.h
index f59e3b7..e045adf 100644
--- a/arm_compute/runtime/CL/functions/CLQuantizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLQuantizationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLRNNLayer.h b/arm_compute/runtime/CL/functions/CLRNNLayer.h
index 0291eb1..9d1cb1a 100644
--- a/arm_compute/runtime/CL/functions/CLRNNLayer.h
+++ b/arm_compute/runtime/CL/functions/CLRNNLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,10 +24,10 @@
 #ifndef ARM_COMPUTE_CLRNN_LAYER_H
 #define ARM_COMPUTE_CLRNN_LAYER_H
 
-#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLCopyKernel.h"
-#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
 #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
 #include "arm_compute/runtime/CL/functions/CLGEMM.h"
 
@@ -85,16 +85,16 @@
     void prepare() override;
 
 private:
-    MemoryGroup                          _memory_group;
-    CLGEMM                               _gemm_state_f;
-    CLSaturatedArithmeticOperationKernel _add_kernel;
-    CLActivationLayerKernel              _activation_kernel;
-    CLFullyConnectedLayer                _fully_connected_kernel;
-    CLCopyKernel                         _copy_kernel;
-    CLTensor                             _fully_connected_out;
-    CLTensor                             _gemm_output;
-    CLTensor                             _add_output;
-    bool                                 _is_prepared;
+    MemoryGroup           _memory_group;
+    CLGEMM                _gemm_state_f;
+    CLArithmeticAddition  _add_kernel;
+    CLActivationLayer     _activation;
+    CLFullyConnectedLayer _fully_connected_kernel;
+    CLCopyKernel          _copy_kernel;
+    CLTensor              _fully_connected_out;
+    CLTensor              _gemm_output;
+    CLTensor              _add_output;
+    bool                  _is_prepared;
 };
 }
 #endif /* ARM_COMPUTE_CLRNN_LAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLROIAlignLayer.h b/arm_compute/runtime/CL/functions/CLROIAlignLayer.h
index b6defe6..2e78f16 100644
--- a/arm_compute/runtime/CL/functions/CLROIAlignLayer.h
+++ b/arm_compute/runtime/CL/functions/CLROIAlignLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h b/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h
index 0376e78..3013927 100644
--- a/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h
+++ b/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLRange.h b/arm_compute/runtime/CL/functions/CLRange.h
index 19e11ba..a86cfb6 100644
--- a/arm_compute/runtime/CL/functions/CLRange.h
+++ b/arm_compute/runtime/CL/functions/CLRange.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLReduceMean.h b/arm_compute/runtime/CL/functions/CLReduceMean.h
index 57ec48d..88ead9d 100644
--- a/arm_compute/runtime/CL/functions/CLReduceMean.h
+++ b/arm_compute/runtime/CL/functions/CLReduceMean.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLReductionOperation.h b/arm_compute/runtime/CL/functions/CLReductionOperation.h
index 25cf655..5d050d7 100644
--- a/arm_compute/runtime/CL/functions/CLReductionOperation.h
+++ b/arm_compute/runtime/CL/functions/CLReductionOperation.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,19 +26,17 @@
 
 #include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
 #include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
-#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
 
-#include <cstdint>
 #include <memory>
-#include <vector>
 
 namespace arm_compute
 {
+// Forward declarations
 class ICLTensor;
 
 /** Perform reduction operation.
@@ -54,7 +52,7 @@
 
     /** Set the input and output tensors.
      *
-     * @param[in]  input     Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  input     Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32/S32.
      * @param[out] output    Destination tensor. Data types and data layouts supported: Same as @p input.
      * @param[in]  axis      Axis along which to reduce. Supported reduction axis : 0, 1, 2, 3
      * @param[in]  op        Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
@@ -64,7 +62,7 @@
     /** Set the input and output tensors.
      *
      * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  input           Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32/S32.
      * @param[out] output          Destination tensor. Data types and data layouts supported: Same as @p input.
      * @param[in]  axis            Axis along which to reduce. Supported reduction axis : 0, 1, 2, 3
      * @param[in]  op              Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
@@ -74,7 +72,7 @@
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperation.
      *
-     * @param[in] input     Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] input     Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32/S32.
      * @param[in] output    Destination tensor info. Data types and data layouts supported: Same as @p input.
      * @param[in] axis      Axis along which to reduce. Supported reduction axis : 0, 1, 2, 3
      * @param[in] op        Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
@@ -94,8 +92,7 @@
     std::vector<CLTensor>                   _results_vector;
     std::vector<CLReductionOperationKernel> _reduction_kernels_vector;
     std::vector<CLFillBorderKernel>         _border_handlers_vector;
-    CLReshapeLayerKernel                    _reshape_kernel;
-    ReductionOperation                      _op;
+    CLReshapeLayer                          _reshape;
     unsigned int                            _num_of_stages;
     unsigned int                            _reduction_axis;
     bool                                    _is_serial;
diff --git a/arm_compute/runtime/CL/functions/CLRemap.h b/arm_compute/runtime/CL/functions/CLRemap.h
index dc8a2c4..5b110d5 100644
--- a/arm_compute/runtime/CL/functions/CLRemap.h
+++ b/arm_compute/runtime/CL/functions/CLRemap.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLReorgLayer.h b/arm_compute/runtime/CL/functions/CLReorgLayer.h
index 8b245ab..a7287ce 100644
--- a/arm_compute/runtime/CL/functions/CLReorgLayer.h
+++ b/arm_compute/runtime/CL/functions/CLReorgLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLReshapeLayer.h b/arm_compute/runtime/CL/functions/CLReshapeLayer.h
index e91c2c7..7fc6c3b 100644
--- a/arm_compute/runtime/CL/functions/CLReshapeLayer.h
+++ b/arm_compute/runtime/CL/functions/CLReshapeLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #ifndef ARM_COMPUTE_CLRESHAPELAYER_H
 #define ARM_COMPUTE_CLRESHAPELAYER_H
 
+#include "arm_compute/runtime/CL/ICLOperator.h"
 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"
 
 namespace arm_compute
@@ -31,9 +32,21 @@
 class ICLTensor;
 
 /** Basic function to run @ref CLReshapeLayerKernel */
-class CLReshapeLayer : public ICLSimpleFunction
+class CLReshapeLayer : public IFunction
 {
 public:
+    /** Default Constructor */
+    CLReshapeLayer();
+    /** Default Destructor */
+    ~CLReshapeLayer();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLReshapeLayer(const CLReshapeLayer &) = delete;
+    /** Default move constructor */
+    CLReshapeLayer(CLReshapeLayer &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLReshapeLayer &operator=(const CLReshapeLayer &) = delete;
+    /** Default move assignment operator */
+    CLReshapeLayer &operator=(CLReshapeLayer &&);
     /** Initialise the kernel's inputs and outputs
      *
      * @param[in]  input  First tensor input. Data type supported: All
@@ -56,6 +69,38 @@
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
-}
+
+namespace experimental
+{
+/** Basic function to run @ref CLReshapeLayerKernel */
+class CLReshape : public ICLOperator
+{
+public:
+    /** Initialise the kernel's inputs and outputs
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input tensor info. Data type supported: All
+     * @param[out] output          Output info. Data type supported: Same as @p input
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref CLReshapeLayer
+     *
+     * @param[in] input  Input tensor info. Data type supported: All
+     * @param[in] output Output tensor info. Data type supported: Same as @p input
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+};
+} // namespace experimental
+} // namespace arm_compute
 #endif /*ARM_COMPUTE_CLRESHAPELAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLReverse.h b/arm_compute/runtime/CL/functions/CLReverse.h
index 87ae34c..6b14092 100644
--- a/arm_compute/runtime/CL/functions/CLReverse.h
+++ b/arm_compute/runtime/CL/functions/CLReverse.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLScale.h b/arm_compute/runtime/CL/functions/CLScale.h
index f345995..d776e83 100644
--- a/arm_compute/runtime/CL/functions/CLScale.h
+++ b/arm_compute/runtime/CL/functions/CLScale.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #ifndef ARM_COMPUTE_CLSCALE_H
 #define ARM_COMPUTE_CLSCALE_H
 
+#include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"
 
@@ -39,6 +40,23 @@
 public:
     /** Initialize the function's source, destination, interpolation type and border_mode.
      *
+     * @param[in,out] input  Source tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]    output Destination tensor. Data types supported: Same as @p input
+     *                       All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in]     info   @ref ScaleKernelInfo descriptor to be used to configure
+     */
+    void configure(ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info);
+    /** Initialize the function's source, destination, interpolation type and border_mode.
+     *
+     * @param[in]     compile_context The compile context to be used.
+     * @param[in,out] input           Source tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]    output          Destination tensor. Data types supported: Same as @p input
+     *                                All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in]     info            @ref ScaleKernelInfo descriptor to be used to configure
+     */
+    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info);
+    /** Initialize the function's source, destination, interpolation type and border_mode.
+     *
      * @param[in,out] input                 Source tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
      * @param[out]    output                Destination tensor. Data types supported: Same as @p input
      *                                      All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
@@ -49,6 +67,7 @@
      * @param[in]     use_padding           (Optional) Is padding in use or not. Defaults to true.
      * @param[in]     align_corners         (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
      */
+    ARM_COMPUTE_DEPRECATED_REL(20.08)
     void configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue(),
                    SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool use_padding = true, bool align_corners = false);
     /** Initialize the function's source, destination, interpolation type and border_mode.
@@ -64,11 +83,22 @@
      * @param[in]     use_padding           (Optional) Is padding in use or not. Defaults to true.
      * @param[in]     align_corners         (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
      */
+    ARM_COMPUTE_DEPRECATED_REL(20.08)
     void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue(),
                    SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool use_padding = true, bool align_corners = false);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLScale
      *
+     * @param[in] input  Source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32.
+     * @param[in] output Output tensor info. Data type supported: Same as @p input
+     *                   All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in] info   @ref ScaleKernelInfo descriptor to be used to validate
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ScaleKernelInfo &info);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLScale
+     *
      * @param[in] input                 Source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32.
      * @param[in] output                Output tensor info. Data type supported: Same as @p input
      *                                  All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
@@ -81,6 +111,7 @@
      *
      * @return a status
      */
+    ARM_COMPUTE_DEPRECATED_REL(20.08)
     static Status validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue(),
                            SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool use_padding = true, bool align_corners = false);
 };
diff --git a/arm_compute/runtime/CL/functions/CLScharr3x3.h b/arm_compute/runtime/CL/functions/CLScharr3x3.h
index b25b548..3892874 100644
--- a/arm_compute/runtime/CL/functions/CLScharr3x3.h
+++ b/arm_compute/runtime/CL/functions/CLScharr3x3.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLSelect.h b/arm_compute/runtime/CL/functions/CLSelect.h
index 84d0997..a1af922 100644
--- a/arm_compute/runtime/CL/functions/CLSelect.h
+++ b/arm_compute/runtime/CL/functions/CLSelect.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLSlice.h b/arm_compute/runtime/CL/functions/CLSlice.h
index a8c6e1f..23c398c 100644
--- a/arm_compute/runtime/CL/functions/CLSlice.h
+++ b/arm_compute/runtime/CL/functions/CLSlice.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,15 +24,18 @@
 #ifndef ARM_COMPUTE_CL_SLICE_H
 #define ARM_COMPUTE_CL_SLICE_H
 
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/runtime/CL/ICLOperator.h"
+#include "arm_compute/runtime/IFunction.h"
 
 namespace arm_compute
 {
 // Forward Declarations
 class ICLTensor;
 
+namespace experimental
+{
 /** Basic function to perform tensor slicing */
-class CLSlice : public ICLSimpleFunction
+class CLSlice : public ICLOperator
 {
 public:
     /** Configure kernel
@@ -42,6 +45,55 @@
      * @note End coordinates can be negative, which represents the number of elements before the end of that dimension.
      * @note End indices are not inclusive unless negative.
      *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor info. Data type supported: All.
+     * @param[out] output          Destination tensor info. Data type supported: Same as @p input
+     * @param[in]  starts          The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in]  ends            The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref CLSlice
+     *
+     * @note Supported tensor rank: up to 4
+     * @note Start indices must be non-negative. 0 <= starts[i]
+     * @note End coordinates can be negative, which represents the number of elements before the end of that dimension.
+     * @note End indices are not inclusive unless negative.
+     *
+     * @param[in] input  Source tensor info. Data type supported: All
+     * @param[in] output Destination tensor info. Data type supported: Same as @p input
+     * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in] ends   The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     *
+     * @return A status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
+};
+} // namespace experimental
+
+/** Basic function to perform tensor slicing */
+class CLSlice : public IFunction
+{
+public:
+    /** Default Constructor */
+    CLSlice();
+    /** Default Destructor */
+    ~CLSlice();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLSlice(const CLSlice &) = delete;
+    /** Default move constructor */
+    CLSlice(CLSlice &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLSlice &operator=(const CLSlice &) = delete;
+    /** Default move assignment operator */
+    CLSlice &operator=(CLSlice &&);
+    /** Configure kernel
+     *
+     * @note Supported tensor rank: up to 4
+     * @note Start indices must be non-negative. 0 <= starts[i]
+     * @note End coordinates can be negative, which represents the number of elements before the end of that dimension.
+     * @note End indices are not inclusive unless negative.
+     *
      * @param[in]  input  Source tensor. Data type supported: All.
      * @param[out] output Destination tensor. Data type supported: Same as @p input
      * @param[in]  starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
@@ -78,6 +130,13 @@
      * @return A status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CL_SLICE_H */
diff --git a/arm_compute/runtime/CL/functions/CLSobel3x3.h b/arm_compute/runtime/CL/functions/CLSobel3x3.h
index 24bc0cd..25d4ed6 100644
--- a/arm_compute/runtime/CL/functions/CLSobel3x3.h
+++ b/arm_compute/runtime/CL/functions/CLSobel3x3.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLSobel5x5.h b/arm_compute/runtime/CL/functions/CLSobel5x5.h
index bf26627..1f91c46 100644
--- a/arm_compute/runtime/CL/functions/CLSobel5x5.h
+++ b/arm_compute/runtime/CL/functions/CLSobel5x5.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLSobel7x7.h b/arm_compute/runtime/CL/functions/CLSobel7x7.h
index 13932c7..91daf64 100644
--- a/arm_compute/runtime/CL/functions/CLSobel7x7.h
+++ b/arm_compute/runtime/CL/functions/CLSobel7x7.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
index fadbc43..bb01584 100644
--- a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
+++ b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,10 +24,10 @@
 #ifndef ARM_COMPUTE_CLSOFTMAXLAYER_H
 #define ARM_COMPUTE_CLSOFTMAXLAYER_H
 
-#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"
+#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
@@ -44,11 +44,9 @@
  * @f[ out = exp((x - max(x)) * beta) / sum(exp((x - max(x)) * beta)) @f]
  *
  * Log Softmax is calculated by :
- * @f[ out = (x - max(x) * beta) - \sum{e^{x - max(x) * beta}} @f]
+ * @f[ out = (x - max(x) * beta) - log(\sum{e^{x - max(x) * beta}}) @f]
  *
  * This function runs the following kernels:
- * -# @ref CLLogits1DMaxKernel
- * -# @ref CLLogits1DShiftExpSumKernel
  * -# @ref CLLogits1DNormKernel
  */
 template <bool IS_LOG = false>
@@ -59,36 +57,31 @@
     CLSoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: QASYMM8/F16/F32
+     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax
      * @param[out] output Destination tensor. Data types supported: same as @p input
      * @param[in]  beta   (Optional) A scaling factor for the exponent. Defaults to 1.f
-     * @param[in]  axis   (Optional) Reduction axis. It has the purpose of squashing the first @p axis
-     *                    dimensions together. For instance, given a [4x4x4x4] image,
-     *                    when @p axis is 2, the Softmax reduction will be applied on each of the [4x4] planes of the input image.
+     * @param[in]  axis   (Optional) The last axis of the first n dimensions (inclusive)to reduce. Only supports axis 0.
      */
-    void configure(const ICLTensor *input, ICLTensor *output, float beta = 1.0f, size_t axis = 1);
+    void configure(const ICLTensor *input, ICLTensor *output, float beta = 1.0f, size_t axis = 0);
     /** Set the input and output tensors.
      *
      * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data types supported: QASYMM8/F16/F32
+     * @param[in]  input           Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax
      * @param[out] output          Destination tensor. Data types supported: same as @p input
      * @param[in]  beta            (Optional) A scaling factor for the exponent. Defaults to 1.f
-     * @param[in]  axis            (Optional) Reduction axis. It has the purpose of squashing the first @p axis
-     *                    dimensions together. For instance, given a [4x4x4x4] image,
-     *                    when @p axis is 2, the Softmax reduction will be applied on each of the [4x4] planes of the input image.
+     * @param[in]  axis            (Optional) The last axis of the first n dimensions (inclusive)to reduce. Only supports axis 0.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta = 1.0f, size_t axis = 1);
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta = 1.0f, size_t axis = 0);
     /** Static function to check if given info will lead to a valid configuration of @ref CLSoftmaxLayer
      *
-     * @param[in] input  Source tensor. Data types supported: QASYMM8/F16/F32
+     * @param[in] input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax
      * @param[in] output Destination tensor. Data types supported: same as @p input
      * @param[in] beta   (Optional) A scaling factor for the exponent. Defaults to 1.f
-     * @param[in] axis   (Optional) Reduction axis. It has the purpose of squashing the first @p axis
-     *                    dimensions together. For instance, given a [4x4x4x4] image,
-     *                    when @p axis is 2, the Softmax reduction will be applied on each of the [4x4] planes of the input image.
+     * @param[in] axis   (Optional) The last axis of the first n dimensions (inclusive)to reduce. Only supports axis 0.
+     *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, float beta = 1.0f, size_t axis = 1);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, float beta = 1.0f, size_t axis = 0);
 
     // Inherited methods overridden:
     void run() override;
@@ -103,9 +96,7 @@
      *
      * @param[in] input  Original source tensor.
      * @param[in] output Original destination tensor.
-     * @param[in] axis   (Optional) Reduction axis. It has the purpose of squashing the first @p axis
-     *                    dimensions together. For instance, given a [4x4x4x4] image,
-     *                    when @p axis is 2, the Softmax reduction will be applied on each of the [4x4] planes of the input image.
+     * @param[in] axis   (Optional) The last axis of the first n dimensions (inclusive)to reduce. Only supports axis 0.
      */
     void configure_reshape_input_kernel(const ICLTensor *input, const ICLTensor *output, size_t axis);
     /** Utility method to configure the kernels needed to flatten the input
@@ -118,17 +109,15 @@
      * @param[in] compile_context The compile context to be used.
      * @param[in] input           Original source tensor.
      * @param[in] output          Original destination tensor.
-     * @param[in] axis            (Optional) Reduction axis. It has the purpose of squashing the first @p axis
-     *                    dimensions together. For instance, given a [4x4x4x4] image,
-     *                    when @p axis is 2, the Softmax reduction will be applied on each of the [4x4] planes of the input image.
+     * @param[in] axis            (Optional) The last axis of the first n dimensions (inclusive)to reduce. Only supports axis 0.
      */
     void configure_reshape_input_kernel(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *output, size_t axis);
 
     MemoryGroup                    _memory_group;
     CLLogits1DMaxShiftExpSumKernel _max_shift_exp_sum_kernel;
     CLLogits1DNormKernel           _norm_kernel;
-    std::unique_ptr<ICLKernel>     _flatten_kernel_ptr;
-    CLReshapeLayerKernel           _reshape_kernel;
+    std::unique_ptr<IFunction>     _flatten_ptr;
+    CLReshapeLayer                 _reshape;
     CLTensor                       _max;
     CLTensor                       _sum;
     CLTensor                       _tmp;
diff --git a/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h b/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h
index b8e2bdc..c6f7f11 100644
--- a/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h
+++ b/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h b/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h
index ac011dd..24830cf 100644
--- a/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h
+++ b/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLSplit.h b/arm_compute/runtime/CL/functions/CLSplit.h
index 87265a4..2931203 100644
--- a/arm_compute/runtime/CL/functions/CLSplit.h
+++ b/arm_compute/runtime/CL/functions/CLSplit.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLStackLayer.h b/arm_compute/runtime/CL/functions/CLStackLayer.h
index 9b20445..9587596 100644
--- a/arm_compute/runtime/CL/functions/CLStackLayer.h
+++ b/arm_compute/runtime/CL/functions/CLStackLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLStridedSlice.h b/arm_compute/runtime/CL/functions/CLStridedSlice.h
index bb2bc96..fdbef81 100644
--- a/arm_compute/runtime/CL/functions/CLStridedSlice.h
+++ b/arm_compute/runtime/CL/functions/CLStridedSlice.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,9 @@
 #ifndef ARM_COMPUTE_CL_STRIDED_SLICE_H
 #define ARM_COMPUTE_CL_STRIDED_SLICE_H
 
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/runtime/CL/CLRuntimeContext.h"
+#include "arm_compute/runtime/CL/ICLOperator.h"
+#include "arm_compute/runtime/IFunction.h"
 
 namespace arm_compute
 {
@@ -32,9 +34,24 @@
 class ICLTensor;
 
 /** Basic function to run @ref CLStridedSliceKernel */
-class CLStridedSlice : public ICLSimpleFunction
+class CLStridedSlice : public IFunction
 {
 public:
+    /** Constructor
+     *
+     * @param[in] ctx Runtime context to be used by the function
+     */
+    CLStridedSlice(CLRuntimeContext *ctx = nullptr);
+    /** Destructor */
+    ~CLStridedSlice();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLStridedSlice(const CLStridedSlice &) = delete;
+    /** Default move constructor */
+    CLStridedSlice(CLStridedSlice &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLStridedSlice &operator=(const CLStridedSlice &) = delete;
+    /** Default move assignment operator */
+    CLStridedSlice &operator=(CLStridedSlice &&);
     /** Configure kernel
      *
      * @note Supported tensor rank: up to 4
@@ -88,6 +105,58 @@
     static Status validate(const ITensorInfo *input, const ITensorInfo *output,
                            const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
                            int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
+
+namespace experimental
+{
+/** Basic function to run @ref CLStridedSliceKernel */
+class CLStridedSlice : public ICLOperator
+{
+public:
+    /** Configure kernel
+     *
+     * @note Supported tensor rank: up to 4
+     *
+     * @param[in]  compile_context  The compile context to be used.
+     * @param[in]  input            Source tensor info. Data type supported: All.
+     * @param[out] output           Destination tensor info. Data type supported: Same as @p input
+     * @param[in]  starts           The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in]  ends             The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in]  strides          The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in]  begin_mask       (Optional) If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
+     * @param[in]  end_mask         (Optional) If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
+     * @param[in]  shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
+     *                              A slice of size 1 starting from starts[i] in the dimension must be preserved.
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output,
+                   const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                   int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref CLStridedSlice
+     *
+     * @note Supported tensor rank: up to 4
+     *
+     * @param[in] input            Source tensor info. Data type supported: All.
+     * @param[in] output           Destination tensor info. Data type supported: Same as @p input
+     * @param[in] starts           The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in] ends             The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in] strides          The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in] begin_mask       (Optional) If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
+     * @param[in] end_mask         (Optional) If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
+     * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
+     *                             A slice of size 1 starting from starts[i] in the dimension must be preserved.
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+                           const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                           int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+};
+} // namespace experimental
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CL_STRIDED_SLICE_H */
diff --git a/arm_compute/runtime/CL/functions/CLTableLookup.h b/arm_compute/runtime/CL/functions/CLTableLookup.h
index 1c11f07..32d4b7b 100644
--- a/arm_compute/runtime/CL/functions/CLTableLookup.h
+++ b/arm_compute/runtime/CL/functions/CLTableLookup.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLThreshold.h b/arm_compute/runtime/CL/functions/CLThreshold.h
index d8ae6fb..f3af122 100644
--- a/arm_compute/runtime/CL/functions/CLThreshold.h
+++ b/arm_compute/runtime/CL/functions/CLThreshold.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #ifndef ARM_COMPUTE_CLTHRESHOLD_H
 #define ARM_COMPUTE_CLTHRESHOLD_H
 
+#include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"
 
@@ -31,6 +32,7 @@
 
 namespace arm_compute
 {
+// Forward declarations
 class ICLTensor;
 
 /** Basic function to run @ref CLThresholdKernel */
@@ -47,23 +49,25 @@
      * @param[in]  type        Thresholding type. Can either be BINARY or RANGE.
      * @param[in]  upper       Upper threshold. Only used with RANGE thresholding
      */
+    ARM_COMPUTE_DEPRECATED_REL(20.08)
     void configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold,
                    uint8_t false_value = 0, uint8_t true_value = 0,
                    ThresholdType type = ThresholdType::BINARY, uint8_t upper = 0);
     /** Initialise the function's source, destination, thresholds and threshold type
      *
+     * @param[in]  input  First tensor input. Data types supported: U8.
+     * @param[out] output Output tensor. Data types supported: U8.
+     * @param[in]  info   Threshold descriptor
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info);
+    /** Initialise the function's source, destination, thresholds and threshold type
+     *
      * @param[in]  compile_context The compile context to be used.
      * @param[in]  input           First tensor input. Data types supported: U8.
      * @param[out] output          Output tensor. Data types supported: U8.
-     * @param[in]  threshold       Threshold. If upper threshold is specified, this will be used as the lower threshold.
-     * @param[in]  false_value     Value to assign when the condition is false.
-     * @param[in]  true_value      value to assign when the condition is true.
-     * @param[in]  type            Thresholding type. Can either be BINARY or RANGE.
-     * @param[in]  upper           Upper threshold. Only used with RANGE thresholding
+     * @param[in]  info            Threshold descriptor
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, uint8_t threshold,
-                   uint8_t false_value = 0, uint8_t true_value = 0,
-                   ThresholdType type = ThresholdType::BINARY, uint8_t upper = 0);
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info);
 };
-}
+} // namespace arm_compute
 #endif /*ARM_COMPUTE_CLTHRESHOLD_H */
diff --git a/arm_compute/runtime/CL/functions/CLTile.h b/arm_compute/runtime/CL/functions/CLTile.h
index 0dad9ad..d2f1e97 100644
--- a/arm_compute/runtime/CL/functions/CLTile.h
+++ b/arm_compute/runtime/CL/functions/CLTile.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLTranspose.h b/arm_compute/runtime/CL/functions/CLTranspose.h
index b2fdcda..9ba7caf 100644
--- a/arm_compute/runtime/CL/functions/CLTranspose.h
+++ b/arm_compute/runtime/CL/functions/CLTranspose.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLUnstack.h b/arm_compute/runtime/CL/functions/CLUnstack.h
index 777da69..5d4d571 100644
--- a/arm_compute/runtime/CL/functions/CLUnstack.h
+++ b/arm_compute/runtime/CL/functions/CLUnstack.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -48,8 +48,8 @@
     CLUnstack();
     /** Set the input, output and unstacking axis.
      *
-     * @param[in]     input         A tensor to be unstacked. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
-     * @param[in,out] output_vector A vector of tensors. Data types supported: Same as @p input.
+     * @param[in]     input         A tensor to be unstacked. Data type supported: All.
+     * @param[in,out] output_vector A vector of tensors. Data types supported: same as @p input.
      *                              Note: The number of elements of the vector will be used as the number of slices to be taken from the axis.
      * @param[in]     axis          The axis to unstack along. Valid values are [-R,R) where R is the input's rank. Negative values wrap around.
      *
@@ -58,8 +58,8 @@
     /** Set the input, output and unstacking axis.
      *
      * @param[in]     compile_context The compile context to be used.
-     * @param[in]     input           A tensor to be unstacked. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
-     * @param[in,out] output_vector   A vector of tensors. Data types supported: Same as @p input.
+     * @param[in]     input           A tensor to be unstacked. Data type supported: All.
+     * @param[in,out] output_vector   A vector of tensors. Data types supported: same as @p input.
      *                                Note: The number of elements of the vector will be used as the number of slices to be taken from the axis.
      * @param[in]     axis            The axis to unstack along. Valid values are [-R,R) where R is the input's rank. Negative values wrap around.
      *
@@ -67,8 +67,8 @@
     void configure(const CLCompileContext &compile_context, const ICLTensor *input, const std::vector<ICLTensor *> &output_vector, int axis);
     /** Static function to check if given info will lead to a valid configuration of @ref CLUnstack
      *
-     * @param[in] input         Input tensor info. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
-     * @param[in] output_vector Vector of output tensors' info. Data types supported: Same as @p input.
+     * @param[in] input         Input tensor info. Data type supported: All.
+     * @param[in] output_vector Vector of output tensors' info. Data types supported: same as @p input.
      * @param[in] axis          The axis to unstack along. Valid values are [-R,R) where R is the input's rank. Negative values wrap around.
      *
      * @return a status
diff --git a/arm_compute/runtime/CL/functions/CLUpsampleLayer.h b/arm_compute/runtime/CL/functions/CLUpsampleLayer.h
index 5f4f57f..07b4c8a 100644
--- a/arm_compute/runtime/CL/functions/CLUpsampleLayer.h
+++ b/arm_compute/runtime/CL/functions/CLUpsampleLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,7 +53,7 @@
 
     /** Initialize the function's source, destination, interpolation type and border_mode.
      *
-     * @param[in]  input             Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  input             Source tensor. Data type supported: All.
      * @param[out] output            Destination tensor. Data types supported: same as @p input.
      * @param[in]  info              Contains stride information described in @ref Size2D.
      * @param[in]  upsampling_policy Defines the policy to fill the intermediate pixels.
@@ -63,16 +63,16 @@
     /** Initialize the function's source, destination, interpolation type and border_mode.
      *
      * @param[in]  compile_context   The compile context to be used.
-     * @param[in]  input             Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  input             Source tensor. Data type supported: All.
      * @param[out] output            Destination tensor. Data types supported: same as @p input.
      * @param[in]  info              Contains stride information described in @ref Size2D.
      * @param[in]  upsampling_policy Defines the policy to fill the intermediate pixels.
      */
     void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output,
                    const Size2D &info, const InterpolationPolicy upsampling_policy);
-    /** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayerUpsample
+    /** Static function to check if given info will lead to a valid configuration of @ref CLUpsampleLayerKernel
      *
-     * @param[in] input             Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] input             Source tensor info. Data types supported: All.
      * @param[in] output            Destination tensor info. Data types supported: same as @p input.
      * @param[in] info              Contains  stride information described in @ref Size2D.
      * @param[in] upsampling_policy Defines the policy to fill the intermediate pixels.
diff --git a/arm_compute/runtime/CL/functions/CLWarpAffine.h b/arm_compute/runtime/CL/functions/CLWarpAffine.h
index 1a2fe9d..eb7c05b 100644
--- a/arm_compute/runtime/CL/functions/CLWarpAffine.h
+++ b/arm_compute/runtime/CL/functions/CLWarpAffine.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLWarpPerspective.h b/arm_compute/runtime/CL/functions/CLWarpPerspective.h
index 5db9ec4..2a1f780 100644
--- a/arm_compute/runtime/CL/functions/CLWarpPerspective.h
+++ b/arm_compute/runtime/CL/functions/CLWarpPerspective.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h
index c1de5f1..602f644 100644
--- a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLWinogradInputTransform.h b/arm_compute/runtime/CL/functions/CLWinogradInputTransform.h
index 11a402e..351f880 100644
--- a/arm_compute/runtime/CL/functions/CLWinogradInputTransform.h
+++ b/arm_compute/runtime/CL/functions/CLWinogradInputTransform.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/functions/CLYOLOLayer.h b/arm_compute/runtime/CL/functions/CLYOLOLayer.h
index e70d84b..3e403f4 100644
--- a/arm_compute/runtime/CL/functions/CLYOLOLayer.h
+++ b/arm_compute/runtime/CL/functions/CLYOLOLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/gemm/CLGEMMKernelSelection.h b/arm_compute/runtime/CL/gemm/CLGEMMKernelSelection.h
index 47d7874..a6bc008 100644
--- a/arm_compute/runtime/CL/gemm/CLGEMMKernelSelection.h
+++ b/arm_compute/runtime/CL/gemm/CLGEMMKernelSelection.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h b/arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h
index 04906e3..815c2c8 100644
--- a/arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h
+++ b/arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h b/arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h
index c0bc4fc..4689f0c 100644
--- a/arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h
+++ b/arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h b/arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h
index d9006e7..8712be7 100644
--- a/arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h
+++ b/arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/tuners/BifrostTuner.h b/arm_compute/runtime/CL/tuners/BifrostTuner.h
index 06f76cb..237693f 100644
--- a/arm_compute/runtime/CL/tuners/BifrostTuner.h
+++ b/arm_compute/runtime/CL/tuners/BifrostTuner.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,6 +37,7 @@
     // Inherited overriden methods
     void tune_kernel_static(ICLKernel &kernel) override;
     void tune_kernel_dynamic(ICLKernel &kernel) override;
+    void tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) override;
 };
 } // namespace tuners
 } // namespace arm_compute
diff --git a/arm_compute/runtime/CL/tuners/CLLWSList.h b/arm_compute/runtime/CL/tuners/CLLWSList.h
index 61a1d79..7ce10ac 100644
--- a/arm_compute/runtime/CL/tuners/CLLWSList.h
+++ b/arm_compute/runtime/CL/tuners/CLLWSList.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CL/tuners/MidgardTuner.h b/arm_compute/runtime/CL/tuners/MidgardTuner.h
index 2609f32..86d4604 100644
--- a/arm_compute/runtime/CL/tuners/MidgardTuner.h
+++ b/arm_compute/runtime/CL/tuners/MidgardTuner.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,6 +37,7 @@
     // Inherited overriden methods
     void tune_kernel_static(ICLKernel &kernel) override;
     void tune_kernel_dynamic(ICLKernel &kernel) override;
+    void tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) override;
 };
 } // namespace tuners
 } // namespace arm_compute
diff --git a/arm_compute/runtime/CL/tuners/Tuners.h b/arm_compute/runtime/CL/tuners/Tuners.h
index 05d5b68..274f13d 100644
--- a/arm_compute/runtime/CL/tuners/Tuners.h
+++ b/arm_compute/runtime/CL/tuners/Tuners.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CPP/CPPFunctions.h b/arm_compute/runtime/CPP/CPPFunctions.h
index c196381..07c1c7b 100644
--- a/arm_compute/runtime/CPP/CPPFunctions.h
+++ b/arm_compute/runtime/CPP/CPPFunctions.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CPP/CPPScheduler.h b/arm_compute/runtime/CPP/CPPScheduler.h
index c8de41b..e8ad427 100644
--- a/arm_compute/runtime/CPP/CPPScheduler.h
+++ b/arm_compute/runtime/CPP/CPPScheduler.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #ifndef ARM_COMPUTE_CPPSCHEDULER_H
 #define ARM_COMPUTE_CPPSCHEDULER_H
 
+#include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/runtime/IScheduler.h"
 
 #include <memory>
@@ -38,16 +39,6 @@
     CPPScheduler();
     /** Default destructor */
     ~CPPScheduler();
-    /** Sets the number of threads the scheduler will use to run the kernels.
-     *
-     * @param[in] num_threads If set to 0, then the maximum number of threads supported by C++11 will be used, otherwise the number of threads specified.
-     */
-    void set_num_threads(unsigned int num_threads) override;
-    /** Returns the number of threads that the CPPScheduler has in his pool.
-     *
-     * @return Number of threads available in CPPScheduler.
-     */
-    unsigned int num_threads() const override;
 
     /** Access the scheduler singleton
      *
@@ -55,16 +46,13 @@
      * @return The scheduler
      */
     static CPPScheduler &get();
-    /** Multithread the execution of the passed kernel if possible.
-     *
-     * The kernel will run on a single thread if any of these conditions is true:
-     * - ICPPKernel::is_parallelisable() returns false
-     * - The scheduler has been initialized with only one thread.
-     *
-     * @param[in] kernel Kernel to execute.
-     * @param[in] hints  Hints for the scheduler.
-     */
+
+    // Inherited functions overridden
+    void set_num_threads(unsigned int num_threads) override;
+    void set_num_threads_with_affinity(unsigned int num_threads, BindFunc func) override;
+    unsigned int num_threads() const override;
     void schedule(ICPPKernel *kernel, const Hints &hints) override;
+    void schedule_op(ICPPKernel *kernel, const Hints &hints, ITensorPack &tensors) override;
 
 protected:
     /** Will run the workloads in parallel using num_threads
@@ -74,6 +62,7 @@
     void run_workloads(std::vector<Workload> &workloads) override;
 
 private:
+    void schedule_common(ICPPKernel *kernel, const Hints &hints, ITensorPack &tensors);
     struct Impl;
     std::unique_ptr<Impl> _impl;
 };
diff --git a/arm_compute/runtime/CPP/ICPPSimpleFunction.h b/arm_compute/runtime/CPP/ICPPSimpleFunction.h
index 999e436..49b5cd8 100644
--- a/arm_compute/runtime/CPP/ICPPSimpleFunction.h
+++ b/arm_compute/runtime/CPP/ICPPSimpleFunction.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h b/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h
index b6f55b5..58b4bf2 100644
--- a/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h
+++ b/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,7 +58,7 @@
      * @param[out] classes          The classes output tensor of size [N]. Data types supported: Same as @p scores_in
      * @param[out] batch_splits_out (Optional) The batch splits output tensor. Data types supported: Same as @p scores_in
      * @param[out] keeps            (Optional) The keeps output tensor of size [N]. Data types supported: Same as @p scores_in
-     * @param[in]  keeps_size       (Optional) Number of filtered indices per class tensor of size [num_classes]. Data types supported: Same as @p scores_in
+     * @param[in]  keeps_size       (Optional) Number of filtered indices per class tensor of size [num_classes]. Data types supported: U32.
      * @param[in]  info             (Optional) BoxNMSLimitInfo information.
      */
     void configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes,
@@ -76,7 +76,7 @@
      * @param[in] classes          The classes output tensor of size [N]. Data types supported: Same as @p scores_in
      * @param[in] batch_splits_out (Optional) The batch splits output tensor. Data types supported: Same as @p scores_in
      * @param[in] keeps            (Optional) The keeps output tensor of size [N]. Data types supported: Same as @p scores_in
-     * @param[in] keeps_size       (Optional) Number of filtered indices per class tensor of size [num_classes]. Data types supported: Same as @p scores_in
+     * @param[in] keeps_size       (Optional) Number of filtered indices per class tensor of size [num_classes]. Data types supported: U32.
      * @param[in] info             (Optional) BoxNMSLimitInfo information.
      *
      * @return a status
diff --git a/arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h b/arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h
index 5717201..f2c7ccc 100644
--- a/arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h
+++ b/arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h b/arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h
index cb74ca9..94248ff 100644
--- a/arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h
+++ b/arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h b/arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h
index 4ad2fd0..71c44a8 100644
--- a/arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h
+++ b/arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CPP/functions/CPPPermute.h b/arm_compute/runtime/CPP/functions/CPPPermute.h
index 5a6d8ea..85c1502 100644
--- a/arm_compute/runtime/CPP/functions/CPPPermute.h
+++ b/arm_compute/runtime/CPP/functions/CPPPermute.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/CPP/functions/CPPSplit.h b/arm_compute/runtime/CPP/functions/CPPSplit.h
index 6adcbc3..7929f14 100644
--- a/arm_compute/runtime/CPP/functions/CPPSplit.h
+++ b/arm_compute/runtime/CPP/functions/CPPSplit.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,8 +47,8 @@
     }
     /** Static function to check if given info will lead to a valid configuration of @ref CPPSplit
      *
-     * @param[in] input   The input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
-     * @param[in] outputs A vector containing the output tensors' info. Data types supported: Same as @p input.
+     * @param[in] input   The input tensor info. Data types supported: All.
+     * @param[in] outputs A vector containing the output tensors' info. Data types supported: same as @p input.
      *                    The output tensors should match the input tensor dimensions for all shape dimensions apart
      *                    from the split dimension
      * @param[in] axis    Axis on which to split the input.
diff --git a/arm_compute/runtime/CPP/functions/CPPTopKV.h b/arm_compute/runtime/CPP/functions/CPPTopKV.h
index c94e277..2f63084 100644
--- a/arm_compute/runtime/CPP/functions/CPPTopKV.h
+++ b/arm_compute/runtime/CPP/functions/CPPTopKV.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,7 +39,7 @@
     /** Set the input and output of the kernel.
      *
      * @param[in]  predictions A batch_size x classes tensor. Data types supported: F16/S32/F32/QASYMM8/QASYMM8_SIGNED
-     * @param[in]  targets     A batch_size 1D tensor of class ids. Data types supported: S32
+     * @param[in]  targets     A batch_size 1D tensor of class ids. Data types supported: U32
      * @param[out] output      Computed precision at @p k as a bool 1D tensor. Data types supported: U8
      * @param[in]  k           Number of top elements to look at for computing precision.
      */
@@ -48,7 +48,7 @@
     /** Static function to check if given info will lead to a valid configuration of @ref CPPTopKVKernel
      *
      * @param[in] predictions A batch_size x classes tensor info. Data types supported: F16/S32/F32/QASYMM8/QASYMM8_SIGNED
-     * @param[in] targets     A batch_size 1D tensor info of class ids. Data types supported: S32
+     * @param[in] targets     A batch_size 1D tensor info of class ids. Data types supported: U32
      * @param[in] output      Computed precision at @p k as a bool 1D tensor info. Data types supported: U8
      * @param[in] k           Number of top elements to look at for computing precision.
      *
diff --git a/arm_compute/runtime/CPP/functions/CPPUpsample.h b/arm_compute/runtime/CPP/functions/CPPUpsample.h
index 3e1852b..b97d4d1 100644
--- a/arm_compute/runtime/CPP/functions/CPPUpsample.h
+++ b/arm_compute/runtime/CPP/functions/CPPUpsample.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,8 +38,8 @@
 public:
     /** Configure the upsample CPP kernel
      *
-     * @param[in]  input  The input tensor to upsample. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED
-     * @param[out] output The output tensor. Data types supported: Same as @p input
+     * @param[in]  input  The input tensor to upsample. Data types supported: All.
+     * @param[out] output The output tensor. Data types supported: same as @p input
      * @param[in]  info   Padding information
      */
     void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info);
diff --git a/arm_compute/runtime/CPUUtils.h b/arm_compute/runtime/CPUUtils.h
index 76e48f8..bcc2f66 100644
--- a/arm_compute/runtime/CPUUtils.h
+++ b/arm_compute/runtime/CPUUtils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/DeviceProperties.h b/arm_compute/runtime/DeviceProperties.h
index 03a5fa0..807b53e 100644
--- a/arm_compute/runtime/DeviceProperties.h
+++ b/arm_compute/runtime/DeviceProperties.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/Distribution1D.h b/arm_compute/runtime/Distribution1D.h
index a77e333..5f98f8f 100644
--- a/arm_compute/runtime/Distribution1D.h
+++ b/arm_compute/runtime/Distribution1D.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/FunctionDescriptors.h b/arm_compute/runtime/FunctionDescriptors.h
index 3706d08..16d6c34 100644
--- a/arm_compute/runtime/FunctionDescriptors.h
+++ b/arm_compute/runtime/FunctionDescriptors.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/GCBufferAllocator.h b/arm_compute/runtime/GLES_COMPUTE/GCBufferAllocator.h
index 248c871..ceb89ed 100644
--- a/arm_compute/runtime/GLES_COMPUTE/GCBufferAllocator.h
+++ b/arm_compute/runtime/GLES_COMPUTE/GCBufferAllocator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/GCFunctions.h b/arm_compute/runtime/GLES_COMPUTE/GCFunctions.h
index 4c0d91c..c802662 100644
--- a/arm_compute/runtime/GLES_COMPUTE/GCFunctions.h
+++ b/arm_compute/runtime/GLES_COMPUTE/GCFunctions.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/GCHelpers.h b/arm_compute/runtime/GLES_COMPUTE/GCHelpers.h
index 60c2153..99e2768 100644
--- a/arm_compute/runtime/GLES_COMPUTE/GCHelpers.h
+++ b/arm_compute/runtime/GLES_COMPUTE/GCHelpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/GCMemory.h b/arm_compute/runtime/GLES_COMPUTE/GCMemory.h
index 0c145e1..12a8bce 100644
--- a/arm_compute/runtime/GLES_COMPUTE/GCMemory.h
+++ b/arm_compute/runtime/GLES_COMPUTE/GCMemory.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h b/arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h
index 90137c3..90fd4c6 100644
--- a/arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h
+++ b/arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/GCRuntimeContext.h b/arm_compute/runtime/GLES_COMPUTE/GCRuntimeContext.h
index 1b7514f..d6e7540 100644
--- a/arm_compute/runtime/GLES_COMPUTE/GCRuntimeContext.h
+++ b/arm_compute/runtime/GLES_COMPUTE/GCRuntimeContext.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/GCScheduler.h b/arm_compute/runtime/GLES_COMPUTE/GCScheduler.h
index 5aac6b8..eed4d3b 100644
--- a/arm_compute/runtime/GLES_COMPUTE/GCScheduler.h
+++ b/arm_compute/runtime/GLES_COMPUTE/GCScheduler.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/GCTensor.h b/arm_compute/runtime/GLES_COMPUTE/GCTensor.h
index 0565950..aa31df7 100644
--- a/arm_compute/runtime/GLES_COMPUTE/GCTensor.h
+++ b/arm_compute/runtime/GLES_COMPUTE/GCTensor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h b/arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h
index ee9861c..ff1e9b0 100644
--- a/arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h
+++ b/arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h b/arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h
index 245e6f1..f3f7a3a 100644
--- a/arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h
+++ b/arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.h
index 40e0c1a..481fb19 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h
index fc7a0a0..79c7c0c 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.h
index 65bbacf..15d957e 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h
index 2d0aff5..d6bc6ee 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.h
index eb25dbf..9661b36 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h
index 311b3e2..f80ffa2 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.h
index e3f2c38..3cf4d54 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h
index cb62fef..c206ec4 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h
index 2152616..79af623 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCFillBorder.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCFillBorder.h
index 87bb22d..766e811 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCFillBorder.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCFillBorder.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h
index 60967e9..a13c74a 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h
index 4fc621e..9c1748b 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h
index d0e71b4..67fc86d 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.h
index 782052b..b3d6a28 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h
index 8d511ac..2936402 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.h
index cd35c73..fd69ef7 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.h
index 201e131..bbcc663 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h
index b29f808..d6a79b5 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCScale.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCScale.h
index d4295d5..5a610f2 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCScale.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCScale.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,7 @@
 #ifndef ARM_COMPUTE_GCSCALE_H
 #define ARM_COMPUTE_GCSCALE_H
 
-#include "arm_compute/core/Types.h"
+#include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h"
 
 #include <cstdint>
@@ -49,8 +49,17 @@
      * @param[in]     use_padding           (Optional) Is padding in use or not. Defaults to true.
      * @param[in]     align_corners         (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
      */
+    ARM_COMPUTE_DEPRECATED_REL(20.08)
     void configure(IGCTensor *input, IGCTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue(),
                    SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool use_padding = true, bool align_corners = false);
+    /** Initialize the function's source, destination, interpolation type and border_mode.
+     *
+     * @param[in,out] input  Source tensor. Data types supported: F16. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]    output Destination tensor. Data types supported: Same as @p input
+     *                       All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in]     info   @ref ScaleKernelInfo descriptor to be used to configure this function
+     */
+    void configure(IGCTensor *input, IGCTensor *output, const ScaleKernelInfo &info);
 };
 }
 #endif /*ARM_COMPUTE_GCSCALE_H */
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h
index 33faae5..4ccfe26 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,16 +50,17 @@
     GCSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: F16/F32
-     * @param[out] output Destination tensor. Data types supported: same as @p input
-     * @param[in]  beta   (Optional) A scaling factor for the exponent. Only beta = 1 is supported
-     * @param[in]  axis   (Optional) Reduction axis. It has the purpose of squashing the first @p axis
-     *                    dimensions together. For instance, given a [4x4x4x4] image,
-     *                    when @p axis is 2, the Softmax reduction will be applied on each of the [4x4] planes of the input image.
+     * @param[in]  input           Source tensor. Data types supported: F16/F32
+     * @param[out] output          Destination tensor. Data types supported: same as @p input
+     * @param[in]  beta            (Optional) A scaling factor for the exponent. Only beta = 1 is supported
+     * @param[in]  reduce_end_axis (Optional) The last axis of the first n dimensions (inclusive) to reduce. Defaults to 0.
+     *                   It has the purpose of squashing together the first n dimensions up to (and including) the @p reduce_end_axis. For instance, given a [2x3x4x5] image,
+     *                   when @p reduce_end_axis is 1, the reduction will be applied to axes 0 and 1, and the Softmax op will be applied on each of the [2x3] planes of the input image.
+     *                   Must be in range [0, input_num_dimensions).
      *
-     * @note The value of @p axis must be always 1 for GLES
+     * @note The value of @p reduce_end_axis must be always 0 for GLES
      */
-    void configure(const IGCTensor *input, IGCTensor *output, float beta = 1.0f, size_t axis = 1);
+    void configure(const IGCTensor *input, IGCTensor *output, float beta = 1.0f, size_t reduce_end_axis = 0);
 
     // Inherited methods overridden:
     void run() override;
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCTensorShift.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCTensorShift.h
index a1ac313..546f6d6 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCTensorShift.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCTensorShift.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h
index 633facf..a37031b 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/HOG.h b/arm_compute/runtime/HOG.h
index 6058131..5aa724c 100644
--- a/arm_compute/runtime/HOG.h
+++ b/arm_compute/runtime/HOG.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/IAllocator.h b/arm_compute/runtime/IAllocator.h
index c24eb33..3eb8670 100644
--- a/arm_compute/runtime/IAllocator.h
+++ b/arm_compute/runtime/IAllocator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/IAssetManager.h b/arm_compute/runtime/IAssetManager.h
index 98ad917..b3829e8 100644
--- a/arm_compute/runtime/IAssetManager.h
+++ b/arm_compute/runtime/IAssetManager.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/IFunction.h b/arm_compute/runtime/IFunction.h
index 53ee5b5..265427e 100644
--- a/arm_compute/runtime/IFunction.h
+++ b/arm_compute/runtime/IFunction.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/ILifetimeManager.h b/arm_compute/runtime/ILifetimeManager.h
index 0ff028f..2a0cffa 100644
--- a/arm_compute/runtime/ILifetimeManager.h
+++ b/arm_compute/runtime/ILifetimeManager.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/ILutAllocator.h b/arm_compute/runtime/ILutAllocator.h
index 2e95cd6..227e822 100644
--- a/arm_compute/runtime/ILutAllocator.h
+++ b/arm_compute/runtime/ILutAllocator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/IMemory.h b/arm_compute/runtime/IMemory.h
index c4ae2b5..0aeb112 100644
--- a/arm_compute/runtime/IMemory.h
+++ b/arm_compute/runtime/IMemory.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/IMemoryGroup.h b/arm_compute/runtime/IMemoryGroup.h
index ef8c835..a977a4a 100644
--- a/arm_compute/runtime/IMemoryGroup.h
+++ b/arm_compute/runtime/IMemoryGroup.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/IMemoryManager.h b/arm_compute/runtime/IMemoryManager.h
index ee6837a..4d7d8cd 100644
--- a/arm_compute/runtime/IMemoryManager.h
+++ b/arm_compute/runtime/IMemoryManager.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/IMemoryPool.h b/arm_compute/runtime/IMemoryPool.h
index 86db2ce..b8d36c3 100644
--- a/arm_compute/runtime/IMemoryPool.h
+++ b/arm_compute/runtime/IMemoryPool.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/IMemoryRegion.h b/arm_compute/runtime/IMemoryRegion.h
index 1a7250b..914aa57 100644
--- a/arm_compute/runtime/IMemoryRegion.h
+++ b/arm_compute/runtime/IMemoryRegion.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/IOperator.h b/arm_compute/runtime/IOperator.h
new file mode 100644
index 0000000..e7952bb
--- /dev/null
+++ b/arm_compute/runtime/IOperator.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_IOPERATOR_H
+#define ARM_COMPUTE_IOPERATOR_H
+
+#include "arm_compute/runtime/IOperator.h"
+#include "arm_compute/runtime/IRuntimeContext.h"
+#include "arm_compute/runtime/Types.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+/** Base class for all async functions */
+class IOperator
+{
+public:
+    /** Destructor */
+    virtual ~IOperator() = default;
+    /** Run the kernels contained in the function
+     *
+     * @param[in] tensors Vector that contains the tensors to operate on.
+     *
+     */
+    virtual void run(ITensorPack &tensors) = 0;
+    /** Prepare the function for executing
+     *
+     * Any one-off pre-processing step required by the function is handled here
+     *
+     * @param[in] constants Vector that contains the constants tensors.
+     *
+     * @note Prepare stage might not need all the function's buffers' backing memory to be available in order to execute
+     */
+    virtual void prepare(ITensorPack &constants) = 0;
+
+    /** Return the memory requirements required by the workspace
+     */
+    virtual MemoryRequirements workspace() const = 0;
+};
+} // namespace experimental
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_IOPERATOR_H */
diff --git a/arm_compute/runtime/IPoolManager.h b/arm_compute/runtime/IPoolManager.h
index c654f82..481bde5 100644
--- a/arm_compute/runtime/IPoolManager.h
+++ b/arm_compute/runtime/IPoolManager.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/IRuntimeContext.h b/arm_compute/runtime/IRuntimeContext.h
index 44a77f1..4751fc2 100644
--- a/arm_compute/runtime/IRuntimeContext.h
+++ b/arm_compute/runtime/IRuntimeContext.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/IScheduler.h b/arm_compute/runtime/IScheduler.h
index a5e20ee..9862753 100644
--- a/arm_compute/runtime/IScheduler.h
+++ b/arm_compute/runtime/IScheduler.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,8 @@
 #define ARM_COMPUTE_ISCHEDULER_H
 
 #include "arm_compute/core/CPP/CPPTypes.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/experimental/Types.h"
 
 #include <functional>
 #include <limits>
@@ -32,6 +34,7 @@
 namespace arm_compute
 {
 class ICPPKernel;
+class ITensor;
 
 /** Scheduler interface to run kernels */
 class IScheduler
@@ -44,6 +47,13 @@
         DYNAMIC, /**< Split the workload dynamically using a bucket system */
     };
 
+    /** Function used to map a given thread id to a logical core id
+     *
+     * Mapping function expects the thread index and total number of cores as input,
+     * and returns the logical core index to bind against
+     */
+    using BindFunc = std::function<int(int, int)>;
+
     /** When arm_compute::ISchedular::Hints::_split_dimension is initialized with this value
      * then the schedular is free to break down the problem space over as many dimensions
      * as it wishes
@@ -134,6 +144,13 @@
      */
     virtual void set_num_threads(unsigned int num_threads) = 0;
 
+    /** Sets the number of threads the scheduler will use to run the kernels but also using a binding function to pin the threads to given logical cores
+     *
+     * @param[in] num_threads If set to 0, then one thread per CPU core available on the system will be used, otherwise the number of threads specified.
+     * @param[in] func        Binding function to use.
+     */
+    virtual void set_num_threads_with_affinity(unsigned int num_threads, BindFunc func);
+
     /** Returns the number of threads that the SingleThreadScheduler has in his pool.
      *
      * @return Number of threads available in SingleThreadScheduler.
@@ -147,6 +164,14 @@
      */
     virtual void schedule(ICPPKernel *kernel, const Hints &hints) = 0;
 
+    /** Runs the kernel in the same thread as the caller synchronously.
+     *
+     * @param[in] kernel  Kernel to execute.
+     * @param[in] hints   Hints for the scheduler.
+     * @param[in] tensors Vector containing the tensors to operate on.
+     */
+    virtual void schedule_op(ICPPKernel *kernel, const Hints &hints, ITensorPack &tensors) = 0;
+
     /** Execute all the passed workloads
      *
      * @note there is no guarantee regarding the order in which the workloads will be executed or whether or not they will be executed in parallel.
diff --git a/arm_compute/runtime/ISimpleLifetimeManager.h b/arm_compute/runtime/ISimpleLifetimeManager.h
index d71d787..b2d17c6 100644
--- a/arm_compute/runtime/ISimpleLifetimeManager.h
+++ b/arm_compute/runtime/ISimpleLifetimeManager.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/ITensorAllocator.h b/arm_compute/runtime/ITensorAllocator.h
index 74100f4..e80f7c4 100644
--- a/arm_compute/runtime/ITensorAllocator.h
+++ b/arm_compute/runtime/ITensorAllocator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/ITransformWeights.h b/arm_compute/runtime/ITransformWeights.h
index 4981c5f..2e2e764 100644
--- a/arm_compute/runtime/ITransformWeights.h
+++ b/arm_compute/runtime/ITransformWeights.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/IWeightsManager.h b/arm_compute/runtime/IWeightsManager.h
index 8df571b..12aa1da 100644
--- a/arm_compute/runtime/IWeightsManager.h
+++ b/arm_compute/runtime/IWeightsManager.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/Lut.h b/arm_compute/runtime/Lut.h
index d3e74f3..af18680 100644
--- a/arm_compute/runtime/Lut.h
+++ b/arm_compute/runtime/Lut.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/LutAllocator.h b/arm_compute/runtime/LutAllocator.h
index f84109c..3be8635 100644
--- a/arm_compute/runtime/LutAllocator.h
+++ b/arm_compute/runtime/LutAllocator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/Memory.h b/arm_compute/runtime/Memory.h
index d895e06..1eab605 100644
--- a/arm_compute/runtime/Memory.h
+++ b/arm_compute/runtime/Memory.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/MemoryGroup.h b/arm_compute/runtime/MemoryGroup.h
index 4d48495..9fd2b9f 100644
--- a/arm_compute/runtime/MemoryGroup.h
+++ b/arm_compute/runtime/MemoryGroup.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/MemoryManagerOnDemand.h b/arm_compute/runtime/MemoryManagerOnDemand.h
index c56c8c0..50547ac 100644
--- a/arm_compute/runtime/MemoryManagerOnDemand.h
+++ b/arm_compute/runtime/MemoryManagerOnDemand.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/MemoryRegion.h b/arm_compute/runtime/MemoryRegion.h
index 1b645be..63feabd 100644
--- a/arm_compute/runtime/MemoryRegion.h
+++ b/arm_compute/runtime/MemoryRegion.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/MultiHOG.h b/arm_compute/runtime/MultiHOG.h
index 72edf3a..ca94619 100644
--- a/arm_compute/runtime/MultiHOG.h
+++ b/arm_compute/runtime/MultiHOG.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/MultiImage.h b/arm_compute/runtime/MultiImage.h
index 7505ad9..0be91ad 100644
--- a/arm_compute/runtime/MultiImage.h
+++ b/arm_compute/runtime/MultiImage.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/INEOperator.h b/arm_compute/runtime/NEON/INEOperator.h
new file mode 100644
index 0000000..415e767
--- /dev/null
+++ b/arm_compute/runtime/NEON/INEOperator.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_INEOPERATOR_H
+#define ARM_COMPUTE_INEOPERATOR_H
+
+#include "../../core/ITensor.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/runtime/IOperator.h"
+#include "arm_compute/runtime/IRuntimeContext.h"
+#include "arm_compute/runtime/Types.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace experimental
+{
+/** Basic interface for functions which have a single async NEON kernel */
+class INEOperator : public IOperator
+{
+public:
+    /** Constructor
+     *
+     * @param[in] ctx Runtime context to be used by the function
+     */
+    INEOperator(IRuntimeContext *ctx = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    INEOperator(const INEOperator &) = delete;
+    /** Default move constructor */
+    INEOperator(INEOperator &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    INEOperator &operator=(const INEOperator &) = delete;
+    /** Default move assignment operator */
+    INEOperator &operator=(INEOperator &&) = default;
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+    void prepare(ITensorPack &constants) override;
+    MemoryRequirements workspace() const override;
+
+protected:
+    std::unique_ptr<INEKernel> _kernel;
+    IRuntimeContext           *_ctx;
+    MemoryRequirements         _workspace;
+};
+} // namespace experimental
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_INEOPERATOR_H */
diff --git a/arm_compute/runtime/NEON/INESimpleFunction.h b/arm_compute/runtime/NEON/INESimpleFunction.h
index 8506797..7f2ed2e 100644
--- a/arm_compute/runtime/NEON/INESimpleFunction.h
+++ b/arm_compute/runtime/NEON/INESimpleFunction.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h b/arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h
index 223048f..7d352eb 100644
--- a/arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h
+++ b/arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h
index de364fa..763294e 100644
--- a/arm_compute/runtime/NEON/NEFunctions.h
+++ b/arm_compute/runtime/NEON/NEFunctions.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -105,6 +105,7 @@
 #include "arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h"
 #include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEMagnitude.h"
+#include "arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEMeanStdDev.h"
 #include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEMedian3x3.h"
diff --git a/arm_compute/runtime/NEON/NEScheduler.h b/arm_compute/runtime/NEON/NEScheduler.h
index 54a92bb..13aa51f 100644
--- a/arm_compute/runtime/NEON/NEScheduler.h
+++ b/arm_compute/runtime/NEON/NEScheduler.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h b/arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h
index 2d6f94c..7b35e6d 100644
--- a/arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h
+++ b/arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEAccumulate.h b/arm_compute/runtime/NEON/functions/NEAccumulate.h
index 0426bf9..f403a77 100644
--- a/arm_compute/runtime/NEON/functions/NEAccumulate.h
+++ b/arm_compute/runtime/NEON/functions/NEAccumulate.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEActivationLayer.h b/arm_compute/runtime/NEON/functions/NEActivationLayer.h
index 95901dc..cfece5c 100644
--- a/arm_compute/runtime/NEON/functions/NEActivationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEActivationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,7 @@
 #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
 
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INEOperator.h"
 
 namespace arm_compute
 {
@@ -37,7 +38,7 @@
  *
  * @note The function simulates an activation layer with the specified activation function.
  */
-class NEActivationLayer : public INESimpleFunctionNoBorder
+class NEActivationLayer : public IFunction
 {
 public:
     /** Constructor
@@ -45,14 +46,16 @@
      * @param[in] ctx Runtime context to be used by the function
      */
     NEActivationLayer(IRuntimeContext *ctx = nullptr);
+    /** Destructor */
+    ~NEActivationLayer();
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEActivationLayer(const NEActivationLayer &) = delete;
     /** Default move constructor */
-    NEActivationLayer(NEActivationLayer &&) = default;
+    NEActivationLayer(NEActivationLayer &&);
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     NEActivationLayer &operator=(const NEActivationLayer &) = delete;
     /** Default move assignment operator */
-    NEActivationLayer &operator=(NEActivationLayer &&) = default;
+    NEActivationLayer &operator=(NEActivationLayer &&);
     /** [NEActivationLayer snippet] **/
     /** Set the input and output tensor.
      *
@@ -75,6 +78,38 @@
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info);
+
+    // Inherited methods overridden
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
+
+namespace experimental
+{
+/** Basic function to run @ref NEActivationLayerKernel */
+class NEActivationLayer : public INEOperator
+{
+public:
+    /** Set the input and output tensor.
+     *
+     * @param[in]  input           Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
+     * @param[out] output          Destination tensor info. Data type supported: same as @p input
+     * @param[in]  activation_info Activation layer parameters.
+     */
+    void configure(const ITensorInfo *input, ITensorInfo *output, const ActivationLayerInfo &activation_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEActivationLayer
+     *
+     * @param[in] input    Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
+     * @param[in] output   Destination tensor info. Data type supported: same as @p input
+     * @param[in] act_info Activation layer information.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info);
+};
+} // namespace experimental
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NEACTIVATIONLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h
index c50f358..61762f3 100644
--- a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
index 6cab5b3..e10771e 100644
--- a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
+++ b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,30 +25,47 @@
 #define ARM_COMPUTE_NEARITHMETICADDITION_H
 
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/INEOperator.h"
 
 namespace arm_compute
 {
 class ITensor;
 
+namespace experimental
+{
 /** Basic function to run @ref NEArithmeticAdditionKernel */
-class NEArithmeticAddition : public INESimpleFunction
+class NEArithmeticAddition : public INEOperator
 {
 public:
     /** Initialise the kernel's inputs, output and conversion policy.
      *
-     * @param[in]  input1   First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
-     * @param[in]  input2   Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
-     * @param[out] output   Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)           -> U8
+     *   - (U8,U8)           -> S16
+     *   - (S16,U8)          -> S16
+     *   - (U8,S16)          -> S16
+     *   - (S16,S16)         -> S16
+     *   - (S32,S32)         -> S32
+     *   - (F16,F16)         -> F16
+     *   - (F32,F32)         -> F32
+     *   - (QASYMM8,QASYMM8) -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16) -> QSYMM16
+     *
+     * @param[in]  input1   First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+     * @param[in]  input2   Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+     * @param[out] output   Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
      * @param[in]  policy   Policy to use to handle overflow.
      * @param[in]  act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
      */
-    void configure(ITensor *input1, ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticAddition
      *
-     * @param[in] input1   First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
-     * @param[in] input2   Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
-     * @param[in] output   Output tensor. Data types supported: U8/SQASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
+     * @param[in] input1   First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+     * @param[in] input2   Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+     * @param[in] output   Output tensor info. Data types supported: U8/SQASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
      * @param[in] policy   Policy to use to handle overflow
      * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
      *
@@ -56,5 +73,65 @@
      */
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
 };
+} // namespace experimental
+
+/** Basic function to run @ref NEArithmeticAdditionKernel */
+class NEArithmeticAddition : public IFunction
+{
+public:
+    /** Default Constructor */
+    NEArithmeticAddition();
+    /** Default Destructor */
+    ~NEArithmeticAddition();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEArithmeticAddition(const NEArithmeticAddition &) = delete;
+    /** Default move constructor */
+    NEArithmeticAddition(NEArithmeticAddition &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEArithmeticAddition &operator=(const NEArithmeticAddition &) = delete;
+    /** Default move assignment operator */
+    NEArithmeticAddition &operator=(NEArithmeticAddition &&);
+    /** Initialise the kernel's inputs, output and conversion policy.
+     *
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)           -> U8
+     *   - (U8,U8)           -> S16
+     *   - (S16,U8)          -> S16
+     *   - (U8,S16)          -> S16
+     *   - (S16,S16)         -> S16
+     *   - (S32,S32)         -> S32
+     *   - (F16,F16)         -> F16
+     *   - (F32,F32)         -> F32
+     *   - (QASYMM8,QASYMM8) -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16) -> QSYMM16
+     *
+     * @param[in]  input1   First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+     * @param[in]  input2   Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+     * @param[out] output   Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+     * @param[in]  policy   Policy to use to handle overflow.
+     * @param[in]  act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
+     */
+    void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticAddition
+     *
+     * @param[in] input1   First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+     * @param[in] input2   Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+     * @param[in] output   Output tensor info. Data types supported: U8/SQASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+     * @param[in] policy   Policy to use to handle overflow
+     * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
+};
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NEARITHMETICADDITION_H */
diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h
index 69d7b4b..a38335c 100644
--- a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h
+++ b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,35 +25,61 @@
 #define ARM_COMPUTE_NEARITHMETICSUBTRACTION_H
 
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/INEOperator.h"
 
 namespace arm_compute
 {
 class ITensor;
 
+namespace experimental
+{
 /** Basic function to run @ref NEArithmeticSubtractionKernel
  *
  * @note The tensor data type for the inputs must be U8/QASYMM8/S16/F16/F32.
  * @note The function performs an arithmetic subtraction between two tensors.
  *
  *  This function calls the following kernels:
- * -# @ref NEFillBorderKernel (In case of broadcasting, in the input being broadcasted)
  * -# @ref NEArithmeticSubtractionKernel
  */
-class NEArithmeticSubtraction : public INESimpleFunction
+class NEArithmeticSubtraction : public INEOperator
 {
 public:
     /** Initialise the kernel's inputs, output and conversion policy.
      *
-     * @param[in]  input1   First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
-     * @param[in]  input2   Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
-     * @param[out] output   Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)                          -> U8
+     *   - (U8,U8)                          -> S16
+     *   - (QASYMM8, QASYMM8)               -> QASYMM8
+     *   - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (S16,U8)                         -> S16
+     *   - (U8,S16)                         -> S16
+     *   - (S16,S16)                        -> S16
+     *   - (F16,F16)                        -> F16
+     *   - (F32,F32)                        -> F32
+     *
+     * @param[in]  input1   First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
+     * @param[in]  input2   Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
+     * @param[out] output   Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
      * @param[in]  policy   Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized.
      * @param[in]  act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
      */
-    void configure(ITensor *input1, ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtraction
      *
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)                          -> U8
+     *   - (U8,U8)                          -> S16
+     *   - (QASYMM8, QASYMM8)               -> QASYMM8
+     *   - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (S16,U8)                         -> S16
+     *   - (U8,S16)                         -> S16
+     *   - (S16,S16)                        -> S16
+     *   - (F16,F16)                        -> F16
+     *   - (F32,F32)                        -> F32
+     *
      * @param[in] input1   First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
      * @param[in] input2   Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
      * @param[in] output   Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
@@ -64,5 +90,58 @@
      */
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
 };
-}
+} // namespace experimental
+
+/** Basic function to run @ref NEArithmeticSubtractionKernel
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/F16/F32.
+ * @note The function performs an arithmetic subtraction between two tensors.
+ *
+ *  This function calls the following kernels:
+ * -# @ref NEArithmeticSubtractionKernel
+ */
+class NEArithmeticSubtraction : public IFunction
+{
+public:
+    /** Default Constructor */
+    NEArithmeticSubtraction();
+    /** Default Destructor */
+    ~NEArithmeticSubtraction();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEArithmeticSubtraction(const NEArithmeticSubtraction &) = delete;
+    /** Default move constructor */
+    NEArithmeticSubtraction(NEArithmeticSubtraction &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEArithmeticSubtraction &operator=(const NEArithmeticSubtraction &) = delete;
+    /** Default move assignment operator */
+    NEArithmeticSubtraction &operator=(NEArithmeticSubtraction &&);
+    /** Initialise the kernel's inputs, output and conversion policy.
+     *
+     * @param[in]  input1   First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
+     * @param[in]  input2   Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
+     * @param[out] output   Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
+     * @param[in]  policy   Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized.
+     * @param[in]  act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
+     */
+    void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtraction
+     *
+     * @param[in] input1   First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
+     * @param[in] input2   Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
+     * @param[in] output   Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
+     * @param[in] policy   Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized.
+     * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
 #endif /* ARM_COMPUTE_NEARITHMETICSUBTRACTION_H */
diff --git a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h
index 14416e7..1f77164 100644
--- a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h b/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h
index 2a62530..1a6ffa9 100644
--- a/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEBitwiseAnd.h b/arm_compute/runtime/NEON/functions/NEBitwiseAnd.h
index c254c30..c612a14 100644
--- a/arm_compute/runtime/NEON/functions/NEBitwiseAnd.h
+++ b/arm_compute/runtime/NEON/functions/NEBitwiseAnd.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEBitwiseNot.h b/arm_compute/runtime/NEON/functions/NEBitwiseNot.h
index 15e1250..f6ef975 100644
--- a/arm_compute/runtime/NEON/functions/NEBitwiseNot.h
+++ b/arm_compute/runtime/NEON/functions/NEBitwiseNot.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEBitwiseOr.h b/arm_compute/runtime/NEON/functions/NEBitwiseOr.h
index 0e62620..8fc4b0d 100644
--- a/arm_compute/runtime/NEON/functions/NEBitwiseOr.h
+++ b/arm_compute/runtime/NEON/functions/NEBitwiseOr.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEBitwiseXor.h b/arm_compute/runtime/NEON/functions/NEBitwiseXor.h
index 1dcc6e2..20e23af 100644
--- a/arm_compute/runtime/NEON/functions/NEBitwiseXor.h
+++ b/arm_compute/runtime/NEON/functions/NEBitwiseXor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h b/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h
index 27c1c51..14d5de4 100644
--- a/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h
+++ b/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEBox3x3.h b/arm_compute/runtime/NEON/functions/NEBox3x3.h
index c382ea9..80cd508 100644
--- a/arm_compute/runtime/NEON/functions/NEBox3x3.h
+++ b/arm_compute/runtime/NEON/functions/NEBox3x3.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NECannyEdge.h b/arm_compute/runtime/NEON/functions/NECannyEdge.h
index 84cc2de..f171c3b 100644
--- a/arm_compute/runtime/NEON/functions/NECannyEdge.h
+++ b/arm_compute/runtime/NEON/functions/NECannyEdge.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NECast.h b/arm_compute/runtime/NEON/functions/NECast.h
index 55c21a0..ca818be 100644
--- a/arm_compute/runtime/NEON/functions/NECast.h
+++ b/arm_compute/runtime/NEON/functions/NECast.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,23 +44,23 @@
      * Valid conversions Input -> Output :
      *
      *   - QASYMM8_SIGNED -> S16, S32, F32, F16
-     *   - QASYMM8 -> U16, S16, S32, F32, F16
-     *   - U8 -> U16, S16, S32, F32, F16
-     *   - U16 -> U8, U32
-     *   - S16 -> QASYMM8_SIGNED, U8, S32
-     *   - F16 -> QASYMM8_SIGNED, QASYMM8, F32, S32, U8
-     *   - S32 -> QASYMM8_SIGNED, QASYMM8, F16, F32, U8
-     *   - F32 -> QASYMM8_SIGNED, QASYMM8, F16, S32, U8
+     *   - QASYMM8        -> U16, S16, S32, F32, F16
+     *   - U8             -> U16, S16, S32, F32, F16
+     *   - U16            -> U8, U32
+     *   - S16            -> QASYMM8_SIGNED, U8, S32
+     *   - F16            -> QASYMM8_SIGNED, QASYMM8, F32, S32, U8
+     *   - S32            -> QASYMM8_SIGNED, QASYMM8, F16, F32, U8
+     *   - F32            -> QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8
      *
      * @param[in]  input  The input tensor to convert. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/F16/S32/F32.
-     * @param[out] output The output tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/S8/U16/S16/U32/S32/F16/F32.
+     * @param[out] output The output tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/S8/U16/S16/U32/S32/BFLOAT16/F16/F32.
      * @param[in]  policy Conversion policy.
      */
     void configure(ITensor *input, ITensor *output, ConvertPolicy policy);
     /** Static function to check if given info will lead to a valid configuration of @ref NECast
      *
      * @param[in] input  Source tensor info. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/F16/S32/F32.
-     * @param[in] output Destination tensor info. Data type supported: QASYMM8_SIGNED/QASYMM8/U8/S8/U16/S16/U32/S32/F16/F32.
+     * @param[in] output Destination tensor info. Data type supported: QASYMM8_SIGNED/QASYMM8/U8/S8/U16/S16/U32/S32/BFLOAT16/F16/F32.
      * @param[in] policy Conversion policy.
      *
      * @return a status
diff --git a/arm_compute/runtime/NEON/functions/NEChannelCombine.h b/arm_compute/runtime/NEON/functions/NEChannelCombine.h
index ba15916..c4ced62 100644
--- a/arm_compute/runtime/NEON/functions/NEChannelCombine.h
+++ b/arm_compute/runtime/NEON/functions/NEChannelCombine.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEChannelExtract.h b/arm_compute/runtime/NEON/functions/NEChannelExtract.h
index 96ba1c8..54059e9 100644
--- a/arm_compute/runtime/NEON/functions/NEChannelExtract.h
+++ b/arm_compute/runtime/NEON/functions/NEChannelExtract.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h b/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h
index 716518a..f31518e 100644
--- a/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NECol2Im.h b/arm_compute/runtime/NEON/functions/NECol2Im.h
index 5da0b91..e03ec42 100644
--- a/arm_compute/runtime/NEON/functions/NECol2Im.h
+++ b/arm_compute/runtime/NEON/functions/NECol2Im.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEColorConvert.h b/arm_compute/runtime/NEON/functions/NEColorConvert.h
index ee76db2..b4c4158 100644
--- a/arm_compute/runtime/NEON/functions/NEColorConvert.h
+++ b/arm_compute/runtime/NEON/functions/NEColorConvert.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEComputeAllAnchors.h b/arm_compute/runtime/NEON/functions/NEComputeAllAnchors.h
index 09c0c9d..44f3f86 100644
--- a/arm_compute/runtime/NEON/functions/NEComputeAllAnchors.h
+++ b/arm_compute/runtime/NEON/functions/NEComputeAllAnchors.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h b/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h
index 8207589..1d703ae 100644
--- a/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,7 @@
 #include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/Requires.h"
+#include "arm_compute/runtime/NEON/INEOperator.h"
 
 #include <memory>
 #include <vector>
@@ -52,6 +53,16 @@
 public:
     /** Default constructor */
     NEConcatenateLayer();
+    /** Destructor */
+    ~NEConcatenateLayer();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEConcatenateLayer(const NEConcatenateLayer &) = delete;
+    /** Default move constructor */
+    NEConcatenateLayer(NEConcatenateLayer &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEConcatenateLayer &operator=(const NEConcatenateLayer &) = delete;
+    /** Default move assignment operator */
+    NEConcatenateLayer &operator=(NEConcatenateLayer &&);
     /** Initialise the kernel's inputs vector and output.
      *
      * @note Input and output tensor dimensions preconditions defer depending on the concatenation axis.
@@ -61,7 +72,6 @@
      * @param[out]    output        Output tensor. Data types supported: Same as @p input.
      * @param[in]     axis          Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
      */
-    void configure(std::vector<ITensor *> inputs_vector, ITensor *output, size_t axis);
     void configure(std::vector<const ITensor *> inputs_vector, ITensor *output, size_t axis);
     /** Static function to check if given info will lead to a valid configuration of @ref NEConcatenateLayer
      *
@@ -74,23 +84,61 @@
      *
      * @return a status
      */
-    static Status validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis);
     static Status validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis);
 
     // Inherited methods overridden:
     void run() override;
 
 private:
-    template <typename TensorType, REQUIRES_TA(std::is_same<typename std::remove_cv<TensorType>::type, ITensor>::value)>
-    void configure_internal(std::vector<TensorType *> &&inputs_vector, ITensor *output, size_t axis);
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
+};
 
-    template <typename TensorInfoType, REQUIRES_TA(std::is_same<typename std::remove_cv<TensorInfoType>::type, ITensorInfo>::value)>
-    static Status validate_internal(const std::vector<TensorInfoType *> &inputs_vector, const ITensorInfo *output, size_t axis);
+namespace experimental
+{
+/** Basic function to execute concatenate tensors along a given axis. This function calls the following kernels:
+ *
+ * -# @ref NEWidthConcatenateLayerKernel (if underlying concatenation axis is 0).
+ * -# @ref NEHeightConcatenateLayerKernel (if underlying concatenation axis is 1).
+ * -# @ref NEDepthConcatenateLayerKernel (if underlying concatenation axis is 2).
+ * -# @ref NEBatchConcatenateLayerKernel (if underlying concatenation axis is 3).
+ */
+class NEConcatenation : public INEOperator
+{
+public:
+    /** Default constructor */
+    NEConcatenation();
+    /** Initialise the kernel's inputs vector and output.
+     *
+     * @note Input and output tensor dimensions preconditions defer depending on the concatenation axis.
+     * @note Preconditions can be found respectively at @ref NEWidthConcatenateLayerKernel, @ref NEHeightConcatenateLayerKernel and @ref NEDepthConcatenateLayerKernel.
+     *
+     * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[out]    output        Output tensor. Data types supported: Same as @p input.
+     * @param[in]     axis          Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
+     */
+    void configure(const std::vector<const ITensorInfo *> &inputs_vector, ITensorInfo *output, size_t axis);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEConcatenateLayer
+     *
+     * @note Input and output tensor dimensions preconditions defer depending on the concatenation axis.
+     * @note Preconditions can be found respectively at @ref NEWidthConcatenateLayerKernel, @ref NEHeightConcatenateLayerKernel and @ref NEDepthConcatenateLayerKernel.
+     *
+     * @param[in] inputs_vector The vectors containing all the tensors info to concatenate. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] output        Output tensor info. Data types supported: Same as @p input.
+     * @param[in] axis          Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
+     *
+     * @return a status
+     */
+    static Status validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
 
 private:
     std::vector<std::unique_ptr<INEKernel>> _concat_kernels;
     unsigned int                            _num_inputs;
     unsigned int                            _axis;
 };
+} // namespace experimental
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NECONCATENATELAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h b/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h
index 42f7870..42a62dc 100644
--- a/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h
+++ b/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEConvolution.h b/arm_compute/runtime/NEON/functions/NEConvolution.h
index c297589..eb16a45 100644
--- a/arm_compute/runtime/NEON/functions/NEConvolution.h
+++ b/arm_compute/runtime/NEON/functions/NEConvolution.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
index b76695b..e8b425b 100644
--- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NECopy.h b/arm_compute/runtime/NEON/functions/NECopy.h
index b03f408..df1a498 100644
--- a/arm_compute/runtime/NEON/functions/NECopy.h
+++ b/arm_compute/runtime/NEON/functions/NECopy.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NECropResize.h b/arm_compute/runtime/NEON/functions/NECropResize.h
index 9f961a1..361c236 100644
--- a/arm_compute/runtime/NEON/functions/NECropResize.h
+++ b/arm_compute/runtime/NEON/functions/NECropResize.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,7 +27,6 @@
 #include "arm_compute/core/NEON/kernels/NECropKernel.h"
 #include "arm_compute/runtime/NEON/functions/NEScale.h"
 
-#include <cstdint>
 #include <memory>
 
 namespace arm_compute
@@ -58,7 +57,7 @@
      * @note Box indices may be outside of the bounds, in which case @p extrapolation_value is used.
      * @note Start and end indices of boxes are inclusive.
      *
-     * @param[in]  input               Source tensor containing N batches of 3D images to be cropped. Data type supported: U16/S16/U32/S32/F16/F32
+     * @param[in]  input               Source tensor containing N batches of 3D images to be cropped. Data type supported: U8/U16/S16/U32/S32/F16/F32
      * @param[in]  boxes               Tensor containing the boxes used to crop the images. Data type supported: F32
      * @param[in]  box_ind             One dimensional tensor containing the batch index of the 3D image in @p input that the corresponding
      *                                 box in @p boxes will be applied to. Data type supported: F32
@@ -76,7 +75,7 @@
      * @note Box indices may be outside of the bounds, in which case @p extrapolation_value is used.
      * @note Start and end indices of boxes are inclusive.
      *
-     * @param[in] input               Source tensor containing N batches of 3D images to be cropped. Data type supported: U16/S16/U32/S32/F16/F32
+     * @param[in] input               Source tensor containing N batches of 3D images to be cropped. Data type supported: U8/U16/S16/U32/S32/F16/F32
      * @param[in] boxes               Tensor info for the tensor containing the boxes used to crop the images. Data type supported: F32
      * @param[in] box_ind             Tensor info for the one dimensional tensor containing the batch index of the 3D image in @p input
      *                                that the corresponding box in @p boxes will be applied to. Data type supported: F32
diff --git a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
index e2ed0e0..378fce7 100644
--- a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h b/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h
index b784480..89f3958 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h b/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h
index 3c21d1a..22bbd6e 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
index 811dc82..116ac16 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h b/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h
index f8d0ce8..77295bc 100644
--- a/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,13 +39,13 @@
 public:
     /** Configure the kernel.
      *
-     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
+     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
      * @param[out] output Destination tensor with the same dimensions of input. Data type supported: F16/F32.
      */
     void configure(const ITensor *input, ITensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref NEDequantizationLayer
      *
-     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
+     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
      * @param[in] output Output tensor info. Data type supported: F16/F32.
      *
      * @return a status
diff --git a/arm_compute/runtime/NEON/functions/NEDerivative.h b/arm_compute/runtime/NEON/functions/NEDerivative.h
index 65d0654..8eb2142 100644
--- a/arm_compute/runtime/NEON/functions/NEDerivative.h
+++ b/arm_compute/runtime/NEON/functions/NEDerivative.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h b/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h
index d616762..e0431b2 100644
--- a/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -55,9 +55,9 @@
     NEDetectionPostProcessLayer &operator=(const NEDetectionPostProcessLayer &) = delete;
     /** Configure the detection output layer NE function
      *
-     * @param[in]  input_box_encoding The bounding box input tensor. Data types supported: F32, QASYMM8.
-     * @param[in]  input_score        The class prediction input tensor. Data types supported: Same as @p input_box_encoding.
-     * @param[in]  input_anchors      The anchors input tensor. Data types supported: Same as @p input_box_encoding.
+     * @param[in]  input_box_encoding The bounding box input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F32.
+     * @param[in]  input_score        The class prediction input tensor. Data types supported: same as @p input_box_encoding.
+     * @param[in]  input_anchors      The anchors input tensor. Data types supported: same as @p input_box_encoding.
      * @param[out] output_boxes       The boxes output tensor. Data types supported: F32.
      * @param[out] output_classes     The classes output tensor. Data types supported: Same as @p output_boxes.
      * @param[out] output_scores      The scores output tensor. Data types supported: Same as @p output_boxes.
@@ -70,14 +70,14 @@
                    ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref NEDetectionPostProcessLayer
      *
-     * @param[in]  input_box_encoding The bounding box input tensor info. Data types supported: F32, QASYMM8.
-     * @param[in]  input_class_score  The class prediction input tensor info. Data types supported: F32, QASYMM8.
-     * @param[in]  input_anchors      The anchors input tensor. Data types supported: F32, QASYMM8.
-     * @param[out] output_boxes       The output tensor. Data types supported: F32.
-     * @param[out] output_classes     The output tensor. Data types supported: Same as @p output_boxes.
-     * @param[out] output_scores      The output tensor. Data types supported: Same as @p output_boxes.
-     * @param[out] num_detection      The number of output detection. Data types supported: Same as @p output_boxes.
-     * @param[in]  info               (Optional) DetectionPostProcessLayerInfo information.
+     * @param[in] input_box_encoding The bounding box input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F32.
+     * @param[in] input_class_score  The class prediction input tensor info. Data types supported: same as @p input_box_encoding.
+     * @param[in] input_anchors      The anchors input tensor info. Data types supported: same as @p input_box_encoding.
+     * @param[in] output_boxes       The output tensor info. Data types supported: F32.
+     * @param[in] output_classes     The output tensor info. Data types supported: Same as @p output_boxes.
+     * @param[in] output_scores      The output tensor info. Data types supported: Same as @p output_boxes.
+     * @param[in] num_detection      The number of output detection tensor info. Data types supported: Same as @p output_boxes.
+     * @param[in] info               (Optional) DetectionPostProcessLayerInfo information.
      *
      * @return a status
      */
diff --git a/arm_compute/runtime/NEON/functions/NEDilate.h b/arm_compute/runtime/NEON/functions/NEDilate.h
index 39a37af..6dae2c7 100644
--- a/arm_compute/runtime/NEON/functions/NEDilate.h
+++ b/arm_compute/runtime/NEON/functions/NEDilate.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
index 68454be..9b18f64 100644
--- a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h b/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h
index cac105c..7d9dac7 100644
--- a/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h
+++ b/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,7 +25,8 @@
 #define ARM_COMPUTE_NEELEMENTWISEOPERATIONS_H
 
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/INEOperator.h"
 
 namespace arm_compute
 {
@@ -36,9 +37,21 @@
  * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
  * @note The function performs a max operation between two tensors.
  */
-class NEElementwiseMax : public INESimpleFunction
+class NEElementwiseMax : public IFunction
 {
 public:
+    /** Default Constructor */
+    NEElementwiseMax();
+    /** Default Destructor */
+    ~NEElementwiseMax();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEElementwiseMax(const NEElementwiseMax &) = delete;
+    /** Default move constructor */
+    NEElementwiseMax(NEElementwiseMax &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEElementwiseMax &operator=(const NEElementwiseMax &) = delete;
+    /** Default move assignment operator */
+    NEElementwiseMax &operator=(NEElementwiseMax &&);
     /** Initialise the kernel's inputs, output and conversion policy.
      *
      * @param[in, out] input1   First tensor input. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
@@ -57,6 +70,13 @@
      * @return a status
      */
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 
 /** Basic function to run @ref NEArithmeticOperationKernel for min
@@ -64,9 +84,21 @@
  * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
  * @note The function performs a min operation between two tensors.
  */
-class NEElementwiseMin : public INESimpleFunction
+class NEElementwiseMin : public IFunction
 {
 public:
+    /** Default Constructor */
+    NEElementwiseMin();
+    /** Default Destructor */
+    ~NEElementwiseMin();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEElementwiseMin(const NEElementwiseMin &) = delete;
+    /** Default move constructor */
+    NEElementwiseMin(NEElementwiseMin &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEElementwiseMin &operator=(const NEElementwiseMin &) = delete;
+    /** Default move assignment operator */
+    NEElementwiseMin &operator=(NEElementwiseMin &&);
     /** Initialise the kernel's inputs, output and conversion policy.
      *
      * @param[in, out] input1   First tensor input. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
@@ -85,6 +117,13 @@
      * @return a status
      */
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 
 /** Basic function to run @ref NEArithmeticOperationKernel for squared difference
@@ -92,9 +131,21 @@
  * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
  * @note The function performs a squared different operation between two tensors (i.e., out[i] = (in1[i] - in2[i])^2
  */
-class NEElementwiseSquaredDiff : public INESimpleFunction
+class NEElementwiseSquaredDiff : public IFunction
 {
 public:
+    /** Default Constructor */
+    NEElementwiseSquaredDiff();
+    /** Default Destructor */
+    ~NEElementwiseSquaredDiff();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEElementwiseSquaredDiff(const NEElementwiseSquaredDiff &) = delete;
+    /** Default move constructor */
+    NEElementwiseSquaredDiff(NEElementwiseSquaredDiff &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEElementwiseSquaredDiff &operator=(const NEElementwiseSquaredDiff &) = delete;
+    /** Default move assignment operator */
+    NEElementwiseSquaredDiff &operator=(NEElementwiseSquaredDiff &&);
     /** Initialise the kernel's inputs, output and conversion policy.
      *
      * @param[in, out] input1   First tensor input. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
@@ -113,6 +164,13 @@
      * @return a status
      */
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 
 /** Basic function to run @ref NEArithmeticOperationKernel for division
@@ -120,9 +178,21 @@
  * @note The tensor data type for the inputs must be F16/F32.
  * @note The function performs a squared different operation between two tensors (i.e., out[i] = in1[i] / in2[i])
  */
-class NEElementwiseDivision : public INESimpleFunction
+class NEElementwiseDivision : public IFunction
 {
 public:
+    /** Default Constructor */
+    NEElementwiseDivision();
+    /** Default Destructor */
+    ~NEElementwiseDivision();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEElementwiseDivision(const NEElementwiseDivision &) = delete;
+    /** Default move constructor */
+    NEElementwiseDivision(NEElementwiseDivision &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEElementwiseDivision &operator=(const NEElementwiseDivision &) = delete;
+    /** Default move assignment operator */
+    NEElementwiseDivision &operator=(NEElementwiseDivision &&);
     /** Initialise the kernel's inputs, output and conversion policy.
      *
      * @param[in, out] input1   First tensor input. Data types supported: F16/F32.
@@ -141,6 +211,13 @@
      * @return a status
      */
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 
 /** Basic function to run @ref NEArithmeticOperationKernel for power
@@ -149,9 +226,21 @@
  * @note The function performs a elementwise power of in1 to in2 (i.e., out[i] = in1[i] ^ in2[i])
  * @note For an exponent that is a float, this function will only work with a positive base.
  */
-class NEElementwisePower : public INESimpleFunction
+class NEElementwisePower : public IFunction
 {
 public:
+    /** Default Constructor */
+    NEElementwisePower();
+    /** Default Destructor */
+    ~NEElementwisePower();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEElementwisePower(const NEElementwisePower &) = delete;
+    /** Default move constructor */
+    NEElementwisePower(NEElementwisePower &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEElementwisePower &operator=(const NEElementwisePower &) = delete;
+    /** Default move assignment operator */
+    NEElementwisePower &operator=(NEElementwisePower &&);
     /** Initialise the kernel's inputs, output and conversion policy.
      *
      * @param[in, out] input1   First tensor input. Data types supported: F16/F32.
@@ -170,6 +259,13 @@
      * @return a status
      */
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 
 /** Basic function to run @ref NEComparisonOperationKernel.
@@ -177,17 +273,256 @@
  * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
  * @note The function performs a comparison operation between two tensors.
  */
-class NEElementwiseComparison : public INESimpleFunction
+class NEElementwiseComparison : public IFunction
 {
 public:
+    /** Default Constructor */
+    NEElementwiseComparison();
+    /** Default Destructor */
+    ~NEElementwiseComparison();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEElementwiseComparison(const NEElementwiseComparison &) = delete;
+    /** Default move constructor */
+    NEElementwiseComparison(NEElementwiseComparison &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEElementwiseComparison &operator=(const NEElementwiseComparison &) = delete;
+    /** Default move assignment operator */
+    NEElementwiseComparison &operator=(NEElementwiseComparison &&);
+    /** Initialise the kernel's inputs, output and conversion policy.
+     *
+     * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+     * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1.
+     * @param[out]     output Output tensor. Data types supported: U8.
+     * @param[in]      op     Comparison Operation to be performed.
+     */
+    void configure(ITensor *input1, ITensor *input2, ITensor *output, ComparisonOperation op);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEComparisonOperationKernel
+     *
+     * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output Output tensor info. Data types supported: U8.
+     * @param[in] op     Comparison Operation to be performed.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
+};
+
+/** Basic function to run @ref NEComparisonOperationKernel
+ *
+ * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+ * @note The function performs a comparison operation between two tensors.
+ */
+template <ComparisonOperation op>
+class NEElementwiseComparisonStatic : public IFunction
+{
+public:
+    /** Default Constructor */
+    NEElementwiseComparisonStatic();
+    /** Default Destructor */
+    ~NEElementwiseComparisonStatic();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEElementwiseComparisonStatic(const NEElementwiseComparisonStatic &) = delete;
+    /** Default move constructor */
+    NEElementwiseComparisonStatic(NEElementwiseComparisonStatic &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEElementwiseComparisonStatic &operator=(const NEElementwiseComparisonStatic &) = delete;
+    /** Default move assignment operator */
+    NEElementwiseComparisonStatic &operator=(NEElementwiseComparisonStatic &&);
     /** Initialise the kernel's inputs, output and conversion policy.
      *
      * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
      * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1.
      * @param[out]     output Output tensor. Data types supported: U16/U32.
+     */
+    void configure(ITensor *input1, ITensor *input2, ITensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEComparisonOperationKernel
+     *
+     * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output Output tensor info. Data types supported: U16/U32.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
+};
+
+/** Basic function to run equal comparison. */
+using NEEqual = NEElementwiseComparisonStatic<ComparisonOperation::Equal>;
+/** Basic function to run not equal comparison. */
+using NENotEqual = NEElementwiseComparisonStatic<ComparisonOperation::NotEqual>;
+/** Basic function to run greater comparison. */
+using NEGreater = NEElementwiseComparisonStatic<ComparisonOperation::Greater>;
+/** Basic function to run greater-equal comparison. */
+using NEGreaterEqual = NEElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>;
+/** Basic function to run less comparison. */
+using NELess = NEElementwiseComparisonStatic<ComparisonOperation::Less>;
+/** Basic function to run less-equal comparison. */
+using NELessEqual = NEElementwiseComparisonStatic<ComparisonOperation::LessEqual>;
+
+namespace experimental
+{
+/** Basic function to run @ref NEArithmeticOperationKernel for max
+ *
+ * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+ * @note The function performs a max operation between two tensors.
+ */
+class NEElementwiseMax : public INEOperator
+{
+public:
+    /** Initialise the kernel's inputs, output and conversion policy.
+     *
+     * @param[in, out] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+     * @param[in, out] input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[out]     output Output tensor info. Data types supported: Same as @p input1.
+     */
+    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel for max
+     *
+     * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+};
+
+/** Basic function to run @ref NEArithmeticOperationKernel for min
+ *
+ * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+ * @note The function performs a min operation between two tensors.
+ */
+class NEElementwiseMin : public INEOperator
+{
+public:
+    /** Initialise the kernel's inputs, output and conversion policy.
+     *
+     * @param[in, out] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+     * @param[in, out] input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[out]     output Output tensor info. Data types supported: Same as @p input1.
+     */
+    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel for min
+     *
+     * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+};
+
+/** Basic function to run @ref NEArithmeticOperationKernel for squared difference
+ *
+ * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+ * @note The function performs a squared different operation between two tensors (i.e., out[i] = (in1[i] - in2[i])^2
+ */
+class NEElementwiseSquaredDiff : public INEOperator
+{
+public:
+    /** Initialise the kernel's inputs, output and conversion policy.
+     *
+     * @param[in, out] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+     * @param[in, out] input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[out]     output Output tensor info. Data types supported: Same as @p input1.
+     */
+    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel for squared difference
+     *
+     * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+};
+
+/** Basic function to run @ref NEArithmeticOperationKernel for division
+ *
+ * @note The tensor data type for the inputs must be F16/F32.
+ * @note The function performs a squared different operation between two tensors (i.e., out[i] = in1[i] / in2[i])
+ */
+class NEElementwiseDivision : public INEOperator
+{
+public:
+    /** Initialise the kernel's inputs, output and conversion policy.
+     *
+     * @param[in, out] input1 First tensor input info. Data types supported: F16/F32.
+     * @param[in, out] input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[out]     output Output tensor info. Data types supported: Same as @p input1.
+     */
+    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel for division
+     *
+     * @param[in] input1 First tensor input info. Data types supported: F16/F32.
+     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+};
+
+/** Basic function to run @ref NEArithmeticOperationKernel for power
+ *
+ * @note The tensor data type for the inputs must be F16/F32.
+ * @note The function performs a elementwise power of in1 to in2 (i.e., out[i] = in1[i] ^ in2[i])
+ * @note For an exponent that is a float, this function will only work with a positive base.
+ */
+class NEElementwisePower : public INEOperator
+{
+public:
+    /** Initialise the kernel's inputs, output and conversion policy.
+     *
+     * @param[in, out] input1 First tensor input info. Data types supported: F16/F32.
+     * @param[in, out] input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[out]     output Output tensor info. Data types supported: Same as @p input1.
+     */
+    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel for power
+     *
+     * @param[in] input1 First tensor input info. Data types supported: F16/F32.
+     * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output Output tensor info. Data types supported: Same as @p input1.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+};
+
+/** Basic function to run @ref NEComparisonOperationKernel.
+ *
+ * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+ * @note The function performs a comparison operation between two tensors.
+ */
+class NEElementwiseComparison : public INEOperator
+{
+public:
+    /** Initialise the kernel's inputs, output and conversion policy.
+     *
+     * @param[in, out] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+     * @param[in, out] input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[out]     output Output tensor info. Data types supported: U16/U32.
      * @param[in]      op     Comparison Operation to be performed.
      */
-    void configure(ITensor *input1, ITensor *input2, ITensor *output, ComparisonOperation op);
+    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ComparisonOperation op);
     /** Static function to check if given info will lead to a valid configuration of @ref NEComparisonOperationKernel
      *
      * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
@@ -206,16 +541,16 @@
  * @note The function performs a comparison operation between two tensors.
  */
 template <ComparisonOperation op>
-class NEElementwiseComparisonStatic : public INESimpleFunction
+class NEElementwiseComparisonStatic : public INEOperator
 {
 public:
     /** Initialise the kernel's inputs, output and conversion policy.
      *
-     * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
-     * @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1.
-     * @param[out]     output Output tensor. Data types supported: U16/U32.
+     * @param[in, out] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+     * @param[in, out] input2 Second tensor input info. Data types supported: Same as @p input1.
+     * @param[out]     output Output tensor info. Data types supported: U16/U32.
      */
-    void configure(ITensor *input1, ITensor *input2, ITensor *output);
+    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output);
     /** Static function to check if given info will lead to a valid configuration of @ref NEComparisonOperationKernel
      *
      * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
@@ -239,5 +574,6 @@
 using NELess = NEElementwiseComparisonStatic<ComparisonOperation::Less>;
 /** Basic function to run less-equal comparison. */
 using NELessEqual = NEElementwiseComparisonStatic<ComparisonOperation::LessEqual>;
+} // namespace experimental
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NEELEMENTWISEOPERATIONS_H */
diff --git a/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h b/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h
index 094f875..8b33018 100644
--- a/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,14 +24,14 @@
 #ifndef ARM_COMPUTE_NEELEMENTWISEUNARYLAYER_H
 #define ARM_COMPUTE_NEELEMENTWISEUNARYLAYER_H
 
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
 
 namespace arm_compute
 {
 class ITensor;
 
 /** Basic function to perform inverse square root on an input tensor. */
-class NERsqrtLayer : public INESimpleFunction
+class NERsqrtLayer : public INESimpleFunctionNoBorder
 {
 public:
     /** Initialize the function
@@ -51,7 +51,7 @@
 };
 
 /** Basic function to perform exponential on an input tensor. */
-class NEExpLayer : public INESimpleFunction
+class NEExpLayer : public INESimpleFunctionNoBorder
 {
 public:
     /** Initialize the function
@@ -71,7 +71,7 @@
 };
 
 /** Basic function to negate an input tensor. */
-class NENegLayer : public INESimpleFunction
+class NENegLayer : public INESimpleFunctionNoBorder
 {
 public:
     /** Initialize the function
@@ -91,18 +91,18 @@
 };
 
 /** Basic function to compute the natural logarithm of an input tensor. */
-class NELogLayer : public INESimpleFunction
+class NELogLayer : public INESimpleFunctionNoBorder
 {
 public:
     /** Initialize the function
      *
-     * @param[in]  input  Input tensor. Data types supported: F16/F32/S32.
+     * @param[in]  input  Input tensor. Data types supported: F16/F32.
      * @param[out] output Output tensor. Data types supported: same as @p input.
      */
     void configure(const ITensor *input, ITensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref NELogLayer
      *
-     * @param[in] input  First tensor input info. Data types supported: F16/F32/S32.
+     * @param[in] input  First tensor input info. Data types supported: F16/F32.
      * @param[in] output Output tensor info. Data types supported: Same as @p input.
      *
      * @return a status
@@ -111,7 +111,7 @@
 };
 
 /** Basic function to compute the absolute value of an input tensor. */
-class NEAbsLayer : public INESimpleFunction
+class NEAbsLayer : public INESimpleFunctionNoBorder
 {
 public:
     /** Initialize the function
@@ -131,7 +131,7 @@
 };
 
 /** Basic function to compute the round value elementwise of an input tensor. */
-class NERoundLayer : public INESimpleFunction
+class NERoundLayer : public INESimpleFunctionNoBorder
 {
 public:
     /** Initialize the function
@@ -151,7 +151,7 @@
 };
 
 /** Basic function to compute the sine of an input tensor. */
-class NESinLayer : public INESimpleFunction
+class NESinLayer : public INESimpleFunctionNoBorder
 {
 public:
     /** Initialize the function
diff --git a/arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h b/arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h
index e9d58f3..5c0c323 100644
--- a/arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h
+++ b/arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEErode.h b/arm_compute/runtime/NEON/functions/NEErode.h
index 1d6ea42..3e84c2b 100644
--- a/arm_compute/runtime/NEON/functions/NEErode.h
+++ b/arm_compute/runtime/NEON/functions/NEErode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEFFT1D.h b/arm_compute/runtime/NEON/functions/NEFFT1D.h
index c710b93..312b46b 100644
--- a/arm_compute/runtime/NEON/functions/NEFFT1D.h
+++ b/arm_compute/runtime/NEON/functions/NEFFT1D.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEFFT2D.h b/arm_compute/runtime/NEON/functions/NEFFT2D.h
index e25ebb9..efcce2e 100644
--- a/arm_compute/runtime/NEON/functions/NEFFT2D.h
+++ b/arm_compute/runtime/NEON/functions/NEFFT2D.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h
index 23df459..dd57900 100644
--- a/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEFastCorners.h b/arm_compute/runtime/NEON/functions/NEFastCorners.h
index e2decb1..cc69e77 100644
--- a/arm_compute/runtime/NEON/functions/NEFastCorners.h
+++ b/arm_compute/runtime/NEON/functions/NEFastCorners.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEFill.h b/arm_compute/runtime/NEON/functions/NEFill.h
index f8a1507..1c3c546 100644
--- a/arm_compute/runtime/NEON/functions/NEFill.h
+++ b/arm_compute/runtime/NEON/functions/NEFill.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEFillBorder.h b/arm_compute/runtime/NEON/functions/NEFillBorder.h
index 0ae04cb..3ac23be 100644
--- a/arm_compute/runtime/NEON/functions/NEFillBorder.h
+++ b/arm_compute/runtime/NEON/functions/NEFillBorder.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEFlattenLayer.h b/arm_compute/runtime/NEON/functions/NEFlattenLayer.h
index 7b4801c..73da254 100644
--- a/arm_compute/runtime/NEON/functions/NEFlattenLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFlattenLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEFloor.h b/arm_compute/runtime/NEON/functions/NEFloor.h
index a11907b..12f0ee2 100644
--- a/arm_compute/runtime/NEON/functions/NEFloor.h
+++ b/arm_compute/runtime/NEON/functions/NEFloor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
index b14650c..21df3c4 100644
--- a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h b/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h
index b3b41c5..6b56135 100644
--- a/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h
+++ b/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
index 8dc6b88..8d65fb5 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMM.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,6 @@
 #ifndef ARM_COMPUTE_NEGEMM_H
 #define ARM_COMPUTE_NEGEMM_H
 
-#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
 #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
@@ -35,6 +34,7 @@
 #include "arm_compute/runtime/IWeightsManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
 #include "arm_compute/runtime/Tensor.h"
 
@@ -112,7 +112,7 @@
     NEGEMMAssemblyDispatch     _asm_glue;
     NEGEMMMatrixAdditionKernel _ma_kernel;
     NEActivationLayer          _alpha_scale_func;
-    NEArithmeticAdditionKernel _add_bias_kernel;
+    NEArithmeticAddition       _add_bias;
     NEActivationLayer          _activation_func;
 
     Tensor         _tmp_a;
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h b/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h
index ae0ae44..a82d44f 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,15 +24,12 @@
 #ifndef ARM_COMPUTE_NEGEMMASSEMBLYDISPATCH_H
 #define ARM_COMPUTE_NEGEMMASSEMBLYDISPATCH_H
 
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/IWeightsManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/Tensor.h"
 
-#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp"
-
 namespace arm_compute
 {
 /** Assembly kernel glue */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
index e7da100..b3f5c51 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -64,21 +64,21 @@
     /** Set the input and output tensors.
      *
      * @param[in]  weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
-     *                     Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+     *                     Data type supported: All.
      * @param[in]  biases  Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
-     *                     Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED, FP32 if @p weights is BFLOAT16
-     * @param[out] output  Destination tensor.
-     *                     Data types supported: Same as @p weights, FP32 if @p weights is BFLOAT16
+     *                     Data type supported: same as @p weights.
+     *                     @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
+     * @param[out] output  Destination tensor. Data types supported: same as @p weights.
      */
     void configure(const ITensor *weights, const ITensor *biases, ITensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref NEConvolutionLayerReshapeWeights
      *
      * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
-     *                    Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+     *                    Data type supported: All.
      * @param[in] biases  Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
-     *                    Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED, FP32 if @p weights is BFLOAT16
-     * @param[in] output  Destination tensor.
-     *                    Data types supported: Same as @p weights FP32 if @p weights is BFLOAT16
+     *                    Data type supported: same as @p weights.
+     *                    @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
+     * @param[in] output  Destination tensor. Data types supported: same as @p weights.
      *
      * @return an error status
      */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h b/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h
index 10d9c37..58cb383 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h
index a8ce1e5..9813b34 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
index 11683c5..01720f0 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
index cbdc788..f29d5d4 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,59 +38,6 @@
 {
 class ITensor;
 
-/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToUint8Scale on NEON.
- *
- *  NEGEMMLowpQuantizeDownInt32ToUint8Scale depends on 3 parameters: result_offset, result_mult_int, result_shift
- *  The final result is:
- *
- *  ((input[i][k] + result_offset) * result_mult_int) >> result_shift
- *
- * In case the bias tensor is provided, the final result is:
- *
- *  ((input[i][k] + bias[k] + result_offset) * result_mult_int) >> result_shift
- *
- *  This function calls the following NEON kernels:
- *
- * -# @ref NEGEMMLowpQuantizeDownInt32ScaleKernel
- *
- * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- *       after the result is shifted right by result_shift
-*/
-class NEGEMMLowpQuantizeDownInt32ToUint8Scale : public INESimpleFunctionNoBorder
-{
-public:
-    /** Initialise the kernel's inputs, output
-     *
-     * @param[in]  input           Input tensor. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32
-     * @param[in]  bias            Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                             Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output          Output tensor. Data type supported: Data type supported: QASYMM8
-     * @param[in]  result_offset   Offset to be added to each element of the input matrix
-     * @param[in]  result_mult_int Value to be multiplied to each element of the input matrix when once the result_offset has been add
-     * @param[in]  result_shift    Number of bits to shift right the result before converting back to QASYMM8
-     * @param[in]  min             (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
-     * @param[in]  max             (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
-     *                             Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
-     */
-    ARM_COMPUTE_DEPRECATED_REL(20.05)
-    void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min = std::numeric_limits<int32_t>::lowest(),
-                   int max = std::numeric_limits<int32_t>::max());
-    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8Scale
-     *
-     * @param[in] input  Input tensor. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32
-     * @param[in] bias   Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                   Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8
-     * @param[in] min    (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
-     * @param[in] max    (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
-     *                   Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
-     *
-     * @return a status
-     */
-    ARM_COMPUTE_DEPRECATED_REL(20.05)
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
-
 /** Basic function to execute NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint on NEON.
  *
  *  NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint depends on 3 parameters:
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h b/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h
index 6a38490..983c95d 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEGather.h b/arm_compute/runtime/NEON/functions/NEGather.h
index 7ed45c0..b872c44 100644
--- a/arm_compute/runtime/NEON/functions/NEGather.h
+++ b/arm_compute/runtime/NEON/functions/NEGather.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEGaussian3x3.h b/arm_compute/runtime/NEON/functions/NEGaussian3x3.h
index 4573666..54fe91b 100644
--- a/arm_compute/runtime/NEON/functions/NEGaussian3x3.h
+++ b/arm_compute/runtime/NEON/functions/NEGaussian3x3.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEGaussian5x5.h b/arm_compute/runtime/NEON/functions/NEGaussian5x5.h
index 8475301..2e042e2 100644
--- a/arm_compute/runtime/NEON/functions/NEGaussian5x5.h
+++ b/arm_compute/runtime/NEON/functions/NEGaussian5x5.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEGaussianPyramid.h b/arm_compute/runtime/NEON/functions/NEGaussianPyramid.h
index a6b2127..d82f763 100644
--- a/arm_compute/runtime/NEON/functions/NEGaussianPyramid.h
+++ b/arm_compute/runtime/NEON/functions/NEGaussianPyramid.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h b/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h
index 7c470fb..f937832 100644
--- a/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,18 +23,19 @@
  */
 #ifndef ARM_COMPUTE_NEGENERATEPROPOSALSLAYER_H
 #define ARM_COMPUTE_NEGENERATEPROPOSALSLAYER_H
+
 #include "arm_compute/core/NEON/kernels/NEBoundingBoxTransformKernel.h"
 #include "arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGenerateProposalsLayerKernel.h"
 #include "arm_compute/core/NEON/kernels/NEPadLayerKernel.h"
 #include "arm_compute/core/NEON/kernels/NEPermuteKernel.h"
 #include "arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CPP/CPPScheduler.h"
 #include "arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 
 namespace arm_compute
@@ -112,9 +113,9 @@
 
     // Neon kernels
     NEPermuteKernel              _permute_deltas_kernel;
-    NEReshapeLayerKernel         _flatten_deltas_kernel;
+    NEReshapeLayer               _flatten_deltas;
     NEPermuteKernel              _permute_scores_kernel;
-    NEReshapeLayerKernel         _flatten_scores_kernel;
+    NEReshapeLayer               _flatten_scores;
     NEComputeAllAnchorsKernel    _compute_anchors_kernel;
     NEBoundingBoxTransformKernel _bounding_box_kernel;
     NEPadLayerKernel             _pad_kernel;
diff --git a/arm_compute/runtime/NEON/functions/NEHOGDescriptor.h b/arm_compute/runtime/NEON/functions/NEHOGDescriptor.h
index f0f46ce..9b6fc47 100644
--- a/arm_compute/runtime/NEON/functions/NEHOGDescriptor.h
+++ b/arm_compute/runtime/NEON/functions/NEHOGDescriptor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEHOGDetector.h b/arm_compute/runtime/NEON/functions/NEHOGDetector.h
index c0bd3da..6400d3c 100644
--- a/arm_compute/runtime/NEON/functions/NEHOGDetector.h
+++ b/arm_compute/runtime/NEON/functions/NEHOGDetector.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEHOGGradient.h b/arm_compute/runtime/NEON/functions/NEHOGGradient.h
index f8c3827..2d3f934 100644
--- a/arm_compute/runtime/NEON/functions/NEHOGGradient.h
+++ b/arm_compute/runtime/NEON/functions/NEHOGGradient.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h b/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h
index 3840b9c..ff64afb 100644
--- a/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h
+++ b/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEHarrisCorners.h b/arm_compute/runtime/NEON/functions/NEHarrisCorners.h
index caf887d..c086e3a 100644
--- a/arm_compute/runtime/NEON/functions/NEHarrisCorners.h
+++ b/arm_compute/runtime/NEON/functions/NEHarrisCorners.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEHistogram.h b/arm_compute/runtime/NEON/functions/NEHistogram.h
index e1a5e42..716f2e7 100644
--- a/arm_compute/runtime/NEON/functions/NEHistogram.h
+++ b/arm_compute/runtime/NEON/functions/NEHistogram.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEIm2Col.h b/arm_compute/runtime/NEON/functions/NEIm2Col.h
index cb905a3..3ea9c1c 100644
--- a/arm_compute/runtime/NEON/functions/NEIm2Col.h
+++ b/arm_compute/runtime/NEON/functions/NEIm2Col.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,7 +46,7 @@
      * @param[in]  input       The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
      *                         while every optional dimension from 4 and above represent a batch of inputs.
      *                         Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32
-     *                         Note: QASYMM8 works only for has_bias = false
+     *                         Note: QASYMM8/QASYMM8_SIGNED works only for has_bias = false
      * @param[out] output      The output tensor. Data types supported: Same as @p input
      * @param[in]  kernel_dims The kernel dimensions (width and height).
      * @param[in]  conv_info   Contains padding and stride information described in @ref PadStrideInfo.
@@ -61,7 +61,7 @@
      * @param[in] input       The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
      *                        while every optional dimension from 4 and above represent a batch of inputs.
      *                        Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32
-     *                        Note: QASYMM8 works only for has_bias = false
+     *                        Note: QASYMM8/QASYMM8_SIGNED works only for has_bias = false
      * @param[in] output      The output tensor. Data types supported: Same as @p input
      * @param[in] kernel_dims The kernel dimensions (width and height).
      * @param[in] conv_info   Contains padding and stride information described in @ref PadStrideInfo.
diff --git a/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h
index e128ec5..85a307c 100644
--- a/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEIntegralImage.h b/arm_compute/runtime/NEON/functions/NEIntegralImage.h
index 2d7669d..6302a7a 100644
--- a/arm_compute/runtime/NEON/functions/NEIntegralImage.h
+++ b/arm_compute/runtime/NEON/functions/NEIntegralImage.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h
index a581600..31e0c61 100644
--- a/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayer.h b/arm_compute/runtime/NEON/functions/NELSTMLayer.h
index e85e87b..4a47dfb 100644
--- a/arm_compute/runtime/NEON/functions/NELSTMLayer.h
+++ b/arm_compute/runtime/NEON/functions/NELSTMLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,17 +25,17 @@
 #define ARM_COMPUTE_NELSTMLAYER_H
 
 #include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
 #include "arm_compute/core/NEON/kernels/NECopyKernel.h"
-#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
+#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
+#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
 #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMM.h"
 #include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
 #include "arm_compute/runtime/common/LSTMParams.h"
 
 namespace arm_compute
@@ -146,89 +146,89 @@
     void prepare() override;
 
 private:
-    MemoryGroup                     _memory_group;
-    NEFullyConnectedLayer           _fully_connected_input_gate;
-    NEArithmeticAddition            _accum_input_gate1;
-    NEArithmeticSubtractionKernel   _subtract_input_gate;
-    NEPixelWiseMultiplicationKernel _pixelwise_mul_input_gate;
-    NEActivationLayerKernel         _activation_input_gate;
-    NEFullyConnectedLayer           _fully_connected_forget_gate;
-    NEArithmeticAddition            _accum_forget_gate1;
-    NEPixelWiseMultiplicationKernel _pixelwise_mul_forget_gate;
-    NEActivationLayerKernel         _activation_forget_gate;
-    NEFullyConnectedLayer           _fully_connected_cell_state;
-    NEGEMM                          _gemm_cell_state1;
-    NETransposeKernel               _transpose_cell_state;
-    NEArithmeticAdditionKernel      _accum_cell_state1;
-    NEArithmeticAdditionKernel      _accum_cell_state2;
-    NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_state1;
-    NEActivationLayerKernel         _activation_cell_state;
-    NEActivationLayerKernel         _cell_clip;
-    NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_state2;
-    NEFullyConnectedLayer           _fully_connected_output;
-    NEPixelWiseMultiplicationKernel _pixelwise_mul_output_state1;
-    NEArithmeticAddition            _accum_output1;
-    NEActivationLayerKernel         _activation_output;
-    NEActivationLayerKernel         _activation_output_state;
-    NEPixelWiseMultiplicationKernel _pixelwise_mul_output_state2;
-    NEFullyConnectedLayer           _fully_connected_output_state;
-    NEActivationLayerKernel         _projection_clip;
-    NECopyKernel                    _copy_cell_state;
-    NECopyKernel                    _copy_output;
-    NEConcatenateLayer              _concat_scratch_buffer;
-    NEConcatenateLayer              _concat_inputs_forget_gate;
-    NEConcatenateLayer              _concat_weights_forget_gate;
-    NEConcatenateLayer              _concat_weights_input_gate;
-    NEConcatenateLayer              _concat_weights_output;
-    NEMeanStdDevNormalizationLayer  _mean_std_norm_input_gate;
-    NEPixelWiseMultiplicationKernel _pixelwise_mul_input_gate_coeff;
-    NEArithmeticAdditionKernel      _accum_input_gate_bias;
-    NEMeanStdDevNormalizationLayer  _mean_std_norm_forget_gate;
-    NEPixelWiseMultiplicationKernel _pixelwise_mul_forget_gate_coeff;
-    NEArithmeticAdditionKernel      _accum_forget_gate_bias;
-    NEMeanStdDevNormalizationLayer  _mean_std_norm_cell_gate;
-    NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_gate_coeff;
-    NEArithmeticAdditionKernel      _accum_cell_gate_bias;
-    NEMeanStdDevNormalizationLayer  _mean_std_norm_output_gate;
-    NEPixelWiseMultiplicationKernel _pixelwise_mul_output_gate_coeff;
-    NEArithmeticAdditionKernel      _accum_output_gate_bias;
-    Tensor                          _input_gate_out1;
-    Tensor                          _input_gate_out2;
-    Tensor                          _input_gate_out3;
-    Tensor                          _input_gate_out4;
-    Tensor                          _forget_gate_out1;
-    Tensor                          _forget_gate_out2;
-    Tensor                          _forget_gate_out3;
-    Tensor                          _forget_gate_out4;
-    Tensor                          _forget_gate_out5;
-    Tensor                          _forget_gate_out6;
-    Tensor                          _cell_state_out1;
-    Tensor                          _cell_state_out2;
-    Tensor                          _cell_state_out3;
-    Tensor                          _cell_state_out4;
-    Tensor                          _cell_state_out5;
-    Tensor                          _output1;
-    Tensor                          _output2;
-    Tensor                          _output3;
-    Tensor                          _output4;
-    Tensor                          _cell_state_activation;
-    Tensor                          _output_state1;
-    Tensor                          _ones;
-    Tensor                          _input_layer_norm_out1;
-    Tensor                          _input_layer_norm_out2;
-    Tensor                          _forget_layer_norm_out1;
-    Tensor                          _forget_layer_norm_out2;
-    Tensor                          _cell_layer_norm_out1;
-    Tensor                          _cell_layer_norm_out2;
-    Tensor                          _output_layer_norm_out1;
-    Tensor                          _output_layer_norm_out2;
-    bool                            _run_peephole_opt;
-    bool                            _run_cifg_opt;
-    bool                            _perform_cell_clipping;
-    bool                            _has_projection_weights;
-    bool                            _perform_projection_clipping;
-    bool                            _is_prepared;
-    bool                            _is_layer_norm_lstm;
+    MemoryGroup                    _memory_group;
+    NEFullyConnectedLayer          _fully_connected_input_gate;
+    NEArithmeticAddition           _accum_input_gate1;
+    NEArithmeticSubtraction        _subtract_input_gate;
+    NEPixelWiseMultiplication      _pixelwise_mul_input_gate;
+    NEActivationLayer              _activation_input_gate;
+    NEFullyConnectedLayer          _fully_connected_forget_gate;
+    NEArithmeticAddition           _accum_forget_gate1;
+    NEPixelWiseMultiplication      _pixelwise_mul_forget_gate;
+    NEActivationLayer              _activation_forget_gate;
+    NEFullyConnectedLayer          _fully_connected_cell_state;
+    NEGEMM                         _gemm_cell_state1;
+    NETransposeKernel              _transpose_cell_state;
+    NEArithmeticAddition           _accum_cell_state1;
+    NEArithmeticAddition           _accum_cell_state2;
+    NEPixelWiseMultiplication      _pixelwise_mul_cell_state1;
+    NEActivationLayer              _activation_cell_state;
+    NEActivationLayer              _cell_clip;
+    NEPixelWiseMultiplication      _pixelwise_mul_cell_state2;
+    NEFullyConnectedLayer          _fully_connected_output;
+    NEPixelWiseMultiplication      _pixelwise_mul_output_state1;
+    NEArithmeticAddition           _accum_output1;
+    NEActivationLayer              _activation_output;
+    NEActivationLayer              _activation_output_state;
+    NEPixelWiseMultiplication      _pixelwise_mul_output_state2;
+    NEFullyConnectedLayer          _fully_connected_output_state;
+    NEActivationLayer              _projection_clip;
+    NECopyKernel                   _copy_cell_state;
+    NECopyKernel                   _copy_output;
+    NEConcatenateLayer             _concat_scratch_buffer;
+    NEConcatenateLayer             _concat_inputs_forget_gate;
+    NEConcatenateLayer             _concat_weights_forget_gate;
+    NEConcatenateLayer             _concat_weights_input_gate;
+    NEConcatenateLayer             _concat_weights_output;
+    NEMeanStdDevNormalizationLayer _mean_std_norm_input_gate;
+    NEPixelWiseMultiplication      _pixelwise_mul_input_gate_coeff;
+    NEArithmeticAddition           _accum_input_gate_bias;
+    NEMeanStdDevNormalizationLayer _mean_std_norm_forget_gate;
+    NEPixelWiseMultiplication      _pixelwise_mul_forget_gate_coeff;
+    NEArithmeticAddition           _accum_forget_gate_bias;
+    NEMeanStdDevNormalizationLayer _mean_std_norm_cell_gate;
+    NEPixelWiseMultiplication      _pixelwise_mul_cell_gate_coeff;
+    NEArithmeticAddition           _accum_cell_gate_bias;
+    NEMeanStdDevNormalizationLayer _mean_std_norm_output_gate;
+    NEPixelWiseMultiplication      _pixelwise_mul_output_gate_coeff;
+    NEArithmeticAddition           _accum_output_gate_bias;
+    Tensor                         _input_gate_out1;
+    Tensor                         _input_gate_out2;
+    Tensor                         _input_gate_out3;
+    Tensor                         _input_gate_out4;
+    Tensor                         _forget_gate_out1;
+    Tensor                         _forget_gate_out2;
+    Tensor                         _forget_gate_out3;
+    Tensor                         _forget_gate_out4;
+    Tensor                         _forget_gate_out5;
+    Tensor                         _forget_gate_out6;
+    Tensor                         _cell_state_out1;
+    Tensor                         _cell_state_out2;
+    Tensor                         _cell_state_out3;
+    Tensor                         _cell_state_out4;
+    Tensor                         _cell_state_out5;
+    Tensor                         _output1;
+    Tensor                         _output2;
+    Tensor                         _output3;
+    Tensor                         _output4;
+    Tensor                         _cell_state_activation;
+    Tensor                         _output_state1;
+    Tensor                         _ones;
+    Tensor                         _input_layer_norm_out1;
+    Tensor                         _input_layer_norm_out2;
+    Tensor                         _forget_layer_norm_out1;
+    Tensor                         _forget_layer_norm_out2;
+    Tensor                         _cell_layer_norm_out1;
+    Tensor                         _cell_layer_norm_out2;
+    Tensor                         _output_layer_norm_out1;
+    Tensor                         _output_layer_norm_out2;
+    bool                           _run_peephole_opt;
+    bool                           _run_cifg_opt;
+    bool                           _perform_cell_clipping;
+    bool                           _has_projection_weights;
+    bool                           _perform_projection_clipping;
+    bool                           _is_prepared;
+    bool                           _is_layer_norm_lstm;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NELSTMLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
index 2f3b8fd..377e173 100644
--- a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
+++ b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NELaplacianPyramid.h b/arm_compute/runtime/NEON/functions/NELaplacianPyramid.h
index 5389f67..1f317f6 100644
--- a/arm_compute/runtime/NEON/functions/NELaplacianPyramid.h
+++ b/arm_compute/runtime/NEON/functions/NELaplacianPyramid.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h b/arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h
index f939725..cc4aa08 100644
--- a/arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h
+++ b/arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
index b2f2b88..e76f6b3 100644
--- a/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
+++ b/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEMagnitude.h b/arm_compute/runtime/NEON/functions/NEMagnitude.h
index 1685000..56c88c2 100644
--- a/arm_compute/runtime/NEON/functions/NEMagnitude.h
+++ b/arm_compute/runtime/NEON/functions/NEMagnitude.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h b/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h
new file mode 100644
index 0000000..f13b4bd
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEMAXUNPOOLINGLAYER_H
+#define ARM_COMPUTE_NEMAXUNPOOLINGLAYER_H
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h"
+#include "arm_compute/core/NEON/kernels/NEMemsetKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Function to perform MaxUnpooling. This function calls the following NEON kernels:
+ *
+ * -# @ref NEMemsetKernel
+ * -# @ref NEMaxUnpoolingLayerKernel
+ */
+class NEMaxUnpoolingLayer : public IFunction
+{
+public:
+    /** Constructor */
+    NEMaxUnpoolingLayer();
+    /** Set the input and output tensors.
+     *
+     * @note Only pool size 2 is supported
+     *
+     * @param[in, out] input     Source tensor. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[out]     output    Destination tensor. Data types supported: Same as @p input.
+     * @param[in]      indices   The indices of the maximal values. Data type supported: U32.
+     * @param[in]      pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     */
+    void configure(ITensor *input, ITensor *indices, ITensor *output, const PoolingLayerInfo &pool_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEMaxUnpoolingLayer
+     *
+     * @note Only pool size 2 is supported
+     *
+     * @param[in] input     Source tensor. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] indices   The indices of the maximal values. Data type supported: U32.
+     * @param[in] output    Destination tensor. Data types supported: Same as @p input.
+     * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    NEMemsetKernel            _memset_kernel;
+    NEMaxUnpoolingLayerKernel _unpooling_layer_kernel;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEMAXUNPOOLINGLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEMeanStdDev.h b/arm_compute/runtime/NEON/functions/NEMeanStdDev.h
index 954b222..120f703 100644
--- a/arm_compute/runtime/NEON/functions/NEMeanStdDev.h
+++ b/arm_compute/runtime/NEON/functions/NEMeanStdDev.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h
index 3ce2b27..132ab8a 100644
--- a/arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEMedian3x3.h b/arm_compute/runtime/NEON/functions/NEMedian3x3.h
index 55064f8..8d860e2 100644
--- a/arm_compute/runtime/NEON/functions/NEMedian3x3.h
+++ b/arm_compute/runtime/NEON/functions/NEMedian3x3.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEMinMaxLocation.h b/arm_compute/runtime/NEON/functions/NEMinMaxLocation.h
index 89b6874..caa66a0 100644
--- a/arm_compute/runtime/NEON/functions/NEMinMaxLocation.h
+++ b/arm_compute/runtime/NEON/functions/NEMinMaxLocation.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NENonLinearFilter.h b/arm_compute/runtime/NEON/functions/NENonLinearFilter.h
index a758e04..d2a8583 100644
--- a/arm_compute/runtime/NEON/functions/NENonLinearFilter.h
+++ b/arm_compute/runtime/NEON/functions/NENonLinearFilter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h b/arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h
index cb8b202..07d4b16 100644
--- a/arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h
+++ b/arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
index af34147..fcdba12 100644
--- a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,10 +28,10 @@
 
 #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
 #include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
 #include "arm_compute/runtime/Tensor.h"
 
 #include <memory>
@@ -42,7 +42,7 @@
 
 /** Basic function to compute a normalization layer. This function calls the following NEON kernels:
  *
- * -# @ref NEPixelWiseMultiplicationKernel
+ * -# @ref NEPixelWiseMultiplication
  * -# @ref NEFillBorderKernel
  * -# @ref NENormalizationLayerKernel
  *
@@ -75,11 +75,10 @@
     void run() override;
 
 private:
-    MemoryGroup                     _memory_group;    /**< Function memory group */
-    NENormalizationLayerKernel      _norm_kernel;     /**< Normalization layer kernel */
-    NEPixelWiseMultiplicationKernel _multiply_kernel; /**< Pixel multiplication kernel */
-    NEFillBorderKernel              _border_handler;  /**< Kernel to handle  borders */
-    Tensor                          _input_squared;   /**< The intermediate buffer which stores results of squaring input */
+    MemoryGroup                _memory_group;  /**< Function memory group */
+    NENormalizationLayerKernel _norm_kernel;   /**< Normalization layer kernel */
+    NEPixelWiseMultiplication  _multiply_f;    /**< Pixel multiplication function */
+    Tensor                     _input_squared; /**< The intermediate buffer which stores results of squaring input */
 };
 }
 #endif /* ARM_COMPUTE_NENORMALIZATIONLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEOpticalFlow.h b/arm_compute/runtime/NEON/functions/NEOpticalFlow.h
index 95068aa..141ee7e 100644
--- a/arm_compute/runtime/NEON/functions/NEOpticalFlow.h
+++ b/arm_compute/runtime/NEON/functions/NEOpticalFlow.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEPReluLayer.h b/arm_compute/runtime/NEON/functions/NEPReluLayer.h
index 102a165..756058b 100644
--- a/arm_compute/runtime/NEON/functions/NEPReluLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPReluLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,21 +25,62 @@
 #define ARM_COMPUTE_NEPRELULAYER_H
 
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/INEOperator.h"
 
 namespace arm_compute
 {
 class ITensor;
 
+namespace experimental
+{
 /** Basic function to run @ref NEArithmeticOperationKernel for PRELU
  *
  * @note The function implements an activation layer with the PRELU activation function.
  */
-class NEPReluLayer : public INESimpleFunction
+class NEPRelu : public INEOperator
 {
 public:
     /** Set the input and output tensor.
      *
+     * @param[in]  input  Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  alpha  Source alpha tensor info. Data types supported: same as @p input.
+     * @param[out] output Destination tensor info. Data type supported: same as @p input
+     */
+    void configure(const ITensorInfo *input, const ITensorInfo *alpha, ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel
+     *
+     * @param[in] input  Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] alpha  Source alpha tensor info. Data types supported: same as @p input.
+     * @param[in] output Destination tensor info. Data type supported: same as @p input
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output);
+};
+} // namespace experimental
+
+/** Basic function to run @ref NEArithmeticOperationKernel for PRELU
+ *
+ * @note The function implements an activation layer with the PRELU activation function.
+ */
+class NEPReluLayer : public IFunction
+{
+public:
+    /** Default Constructor */
+    NEPReluLayer();
+    /** Default Destructor */
+    ~NEPReluLayer();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPReluLayer(const NEPReluLayer &) = delete;
+    /** Default move constructor */
+    NEPReluLayer(NEPReluLayer &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPReluLayer &operator=(const NEPReluLayer &) = delete;
+    /** Default move assignment operator */
+    NEPReluLayer &operator=(NEPReluLayer &&);
+    /** Set the input and output tensor.
+     *
      * @param[in]  input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[in]  alpha  Source alpha tensor. Data types supported: same of @p input.
      * @param[out] output Destination tensor. Data type supported: same as @p input
@@ -54,6 +95,13 @@
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NEPRELULAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEPadLayer.h b/arm_compute/runtime/NEON/functions/NEPadLayer.h
index d3074e7..fcb7c36 100644
--- a/arm_compute/runtime/NEON/functions/NEPadLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPadLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEPermute.h b/arm_compute/runtime/NEON/functions/NEPermute.h
index 4651b30..3be42c8 100644
--- a/arm_compute/runtime/NEON/functions/NEPermute.h
+++ b/arm_compute/runtime/NEON/functions/NEPermute.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEPhase.h b/arm_compute/runtime/NEON/functions/NEPhase.h
index 220681e..c492073 100644
--- a/arm_compute/runtime/NEON/functions/NEPhase.h
+++ b/arm_compute/runtime/NEON/functions/NEPhase.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
index 2b31032..3c1aa52 100644
--- a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
+++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,18 +25,128 @@
 #define ARM_COMPUTE_NEPIXELWISEMULTIPLICATION_H
 
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/INEOperator.h"
 
 namespace arm_compute
 {
 class ITensor;
 
+namespace experimental
+{
 /** Basic function to run @ref NEPixelWiseMultiplicationKernel */
-class NEPixelWiseMultiplication : public INESimpleFunction
+class NEPixelWiseMultiplication : public INEOperator
 {
 public:
     /** Initialise the kernel's inputs, output and convertion policy.
      *
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)                         -> U8
+     *   - (U8,U8)                         -> S16
+     *   - (U8,S16)                        -> S16
+     *   - (S16,U8)                        -> S16
+     *   - (S16,S16)                       -> S16
+     *   - (F16,F16)                       -> F16
+     *   - (F32,F32)                       -> F32
+     *   - (QASYMM8,QASYMM8)               -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16)               -> QSYMM16
+     *   - (QSYMM16,QSYMM16)               -> S32
+     *
+     * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
+     *       For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
+     *
+     * @param[in, out] input1          First input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
+     *                                 This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[in, out] input2          Second input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
+     *                                 This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[out]     output          Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
+     * @param[in]      scale           Scale to apply after multiplication.
+     *                                 Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+     * @param[in]      overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16.
+     * @param[in]      rounding_policy Rounding policy.
+     * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation. Currently not supported.
+     */
+    void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplication
+     *
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)                         -> U8
+     *   - (U8,U8)                         -> S16
+     *   - (U8,S16)                        -> S16
+     *   - (S16,U8)                        -> S16
+     *   - (S16,S16)                       -> S16
+     *   - (F16,F16)                       -> F16
+     *   - (F32,F32)                       -> F32
+     *   - (QASYMM8,QASYMM8)               -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16)               -> QSYMM16
+     *   - (QSYMM16,QSYMM16)               -> S32
+     *
+     * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
+     *       For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
+     *
+     * @param[in] input1          First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
+     * @param[in] input2          Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
+     * @param[in] output          Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
+     * @param[in] scale           Scale to apply after multiplication.
+     *                            Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+     * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16.
+     * @param[in] rounding_policy Rounding policy.
+     * @param[in] act_info        (Optional) Activation layer information in case of a fused activation. Currently not supported.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+
+/** Basic function to run @ref NEComplexPixelWiseMultiplicationKernel. */
+class NEComplexPixelWiseMultiplication : public INEOperator
+{
+public:
+    /** Initialise the kernel's inputs, output.
+     *
+     * @param[in, out] input1   An input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
+     *                          The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[in, out] input2   An input tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+     *                          The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[out]     output   The output tensor. Data types supported: same as @p input1. Number of channels: same as @p input1.
+     * @param[in]      act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
+     */
+    void configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref NEComplexPixelWiseMultiplication
+     *
+     * @param[in] input1   An input tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor).
+     * @param[in] input2   An input tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+     * @param[in] output   The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
+     * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+} // namespace experimental
+
+/** Basic function to run @ref NEPixelWiseMultiplicationKernel */
+class NEPixelWiseMultiplication : public IFunction
+{
+public:
+    /** Default Constructor */
+    NEPixelWiseMultiplication();
+    /** Default Destructor */
+    ~NEPixelWiseMultiplication();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPixelWiseMultiplication(const NEPixelWiseMultiplication &) = delete;
+    /** Default move constructor */
+    NEPixelWiseMultiplication(NEPixelWiseMultiplication &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEPixelWiseMultiplication &operator=(const NEPixelWiseMultiplication &) = delete;
+    /** Default move assignment operator */
+    NEPixelWiseMultiplication &operator=(NEPixelWiseMultiplication &&);
+    /** Initialise the kernel's inputs, output and conversion policy.
+     *
      * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
      *       For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
      *
@@ -59,7 +169,7 @@
      * @param[in]      rounding_policy Rounding policy.
      * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation. Currently not supported.
      */
-    void configure(ITensor *input1, ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+    void configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
                    const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplication
      *
@@ -87,12 +197,31 @@
      */
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
                            const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 
 /** Basic function to run @ref NEComplexPixelWiseMultiplicationKernel. */
-class NEComplexPixelWiseMultiplication : public INESimpleFunction
+class NEComplexPixelWiseMultiplication : public IFunction
 {
 public:
+    /** Default Constructor */
+    NEComplexPixelWiseMultiplication();
+    /** Default Destructor */
+    ~NEComplexPixelWiseMultiplication();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEComplexPixelWiseMultiplication(const NEComplexPixelWiseMultiplication &) = delete;
+    /** Default move constructor */
+    NEComplexPixelWiseMultiplication(NEComplexPixelWiseMultiplication &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEComplexPixelWiseMultiplication &operator=(const NEComplexPixelWiseMultiplication &) = delete;
+    /** Default move assignment operator */
+    NEComplexPixelWiseMultiplication &operator=(NEComplexPixelWiseMultiplication &&);
     /** Initialise the kernel's inputs, output.
      *
      * @param[in, out] input1   An input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
@@ -111,6 +240,13 @@
      * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
      */
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 }
 #endif /*ARM_COMPUTE_NEPIXELWISEMULTIPLICATION_H */
diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
index e43741c..000c754 100644
--- a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,10 +58,10 @@
      *
      * @note F16 is supported for pool sizes 2 and 3 only
      *
-     * @param[in] input     Source tensor. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
-     * @param[in] output    Destination tensor. Data types supported: Same as @p input.
+     * @param[in] input     Source tensor info. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] output    Destination tensor info. Data types supported: Same as @p input.
      * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
-     * @param[in] indices   (optional) The indices of the maximal values. Data type supported: U32.
+     * @param[in] indices   (Optional) Tensor info of the indices of the maximal values. Data type supported: U32.
      *
      * @return a status
      */
diff --git a/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h b/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h
index 242460d..d4bb42f 100644
--- a/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
index d1cc962..59dd567 100644
--- a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,16 +24,16 @@
 #ifndef ARM_COMPUTE_NEQLSTMLAYER_H
 #define ARM_COMPUTE_NEQLSTMLAYER_H
 
-#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
 #include "arm_compute/core/NEON/kernels/NECopyKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
 #include "arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
+#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
+#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
 #include "arm_compute/runtime/NEON/functions/NETranspose.h"
 
 #include "arm_compute/runtime/common/LSTMParams.h"
@@ -48,13 +48,13 @@
  * This function calls the following NEON functions/kernels:
  *
  * -# @ref NEActivationLayer                                     Activation functions (tanh and logistic)
- * -# @ref NEArithmeticAdditionKernel                            Elementwise addition
+ * -# @ref NEArithmeticAddition                                  Elementwise addition
  * -# @ref NEArithmeticSubtractionKernel                         Elementwise subtraction
  * -# @ref NECopyKernel                                          Copy kernel for copying output_state_out to output
  * -# @ref NEGEMMLowpMatrixMultiplyCore                          Quantized matrix multiplication core. Accumulators are 32-bit integers
  * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint   Convert 32-bit integers into QSYMM16
  * -# @ref NEGEMMLowpMatrixAReductionKernel                      For precomputing effective biases to use
- * -# @ref NEPixelWiseMultiplicationKernel                       Elementwise multiplication
+ * -# @ref NEPixelWiseMultiplication                             Elementwise multiplication
  * -# @ref NETranspose                                           Transpose function for reshaping the weights
  * */
 class NEQLSTMLayer : public IFunction
@@ -254,51 +254,51 @@
     NEGEMMLowpMatrixAReductionKernel _input_to_output_reduction{};
     NEGEMMLowpMatrixAReductionKernel _recurrent_to_output_reduction{};
     NEGEMMLowpMatrixAReductionKernel _projection_reduction{};
-    NEArithmeticAdditionKernel       _projection_bias_add{};
+    NEArithmeticAddition             _projection_bias_add{};
     NEGEMMLowpMatrixMultiplyCore     _mm_input_to_forget{};
     NEGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_forget{};
-    NEPixelWiseMultiplicationKernel  _pixelwise_mul_cell_to_forget{};
+    NEPixelWiseMultiplication        _pixelwise_mul_cell_to_forget{};
     NEGEMMLowpOutputStage            _input_to_forget_outstage{};
     NEGEMMLowpOutputStage            _recurrent_to_forget_outstage{};
     NEGEMMLowpOutputStage            _cell_to_forget_outstage{};
-    NEArithmeticAdditionKernel       _accumulate_input_recurrent_forget{};
-    NEArithmeticAdditionKernel       _accumulate_cell_forget{};
+    NEArithmeticAddition             _accumulate_input_recurrent_forget{};
+    NEArithmeticAddition             _accumulate_cell_forget{};
     NEActivationLayer                _forget_gate_sigmoid{};
     NEGEMMLowpMatrixMultiplyCore     _mm_input_to_cell{};
     NEGEMMLowpOutputStage            _input_to_cell_outstage{};
     NEGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_cell{};
     NEGEMMLowpOutputStage            _recurrent_to_cell_outstage{};
-    NEArithmeticAdditionKernel       _accumulate_input_recurrent_modulation{};
+    NEArithmeticAddition             _accumulate_input_recurrent_modulation{};
     NEActivationLayer                _cell_gate_tanh{};
-    NEArithmeticSubtractionKernel    _input_gate_sub{};
+    NEArithmeticSubtraction          _input_gate_sub{};
     NEGEMMLowpMatrixMultiplyCore     _mm_input_to_input{};
     NEGEMMLowpOutputStage            _input_to_input_outstage{};
     NEGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_input{};
     NEGEMMLowpOutputStage            _recurrent_to_input_outstage{};
-    NEArithmeticAdditionKernel       _accumulate_input_recurrent_input{};
-    NEPixelWiseMultiplicationKernel  _pixelwise_mul_cell_to_input{};
+    NEArithmeticAddition             _accumulate_input_recurrent_input{};
+    NEPixelWiseMultiplication        _pixelwise_mul_cell_to_input{};
     NEGEMMLowpOutputStage            _cell_to_input_outstage{};
-    NEArithmeticAdditionKernel       _accumulate_cell_input{};
+    NEArithmeticAddition             _accumulate_cell_input{};
     NEActivationLayer                _input_gate_sigmoid{};
-    NEPixelWiseMultiplicationKernel  _pixelwise_mul_forget_cell{};
-    NEPixelWiseMultiplicationKernel  _pixelwise_mul_input_cell{};
-    NEArithmeticAdditionKernel       _add_forget_cell{};
+    NEPixelWiseMultiplication        _pixelwise_mul_forget_cell{};
+    NEPixelWiseMultiplication        _pixelwise_mul_input_cell{};
+    NEArithmeticAddition             _add_forget_cell{};
     NEActivationLayer                _cell_clip{};
     NEGEMMLowpMatrixMultiplyCore     _mm_input_to_output{};
     NEGEMMLowpOutputStage            _input_to_output_outstage{};
     NEGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_output{};
     NEGEMMLowpOutputStage            _recurrent_to_output_outstage{};
-    NEArithmeticAdditionKernel       _accumulate_input_recurrent_output{};
-    NEPixelWiseMultiplicationKernel  _pixelwise_mul_cell_to_output{};
+    NEArithmeticAddition             _accumulate_input_recurrent_output{};
+    NEPixelWiseMultiplication        _pixelwise_mul_cell_to_output{};
     NEGEMMLowpOutputStage            _cell_to_output_outstage{};
-    NEArithmeticAdditionKernel       _accumulate_cell_to_output{};
+    NEArithmeticAddition             _accumulate_cell_to_output{};
     NEActivationLayer                _output_gate_sigmoid{};
     NEActivationLayer                _hidden_tanh{};
-    NEPixelWiseMultiplicationKernel  _pixelwise_mul_hidden{};
+    NEPixelWiseMultiplication        _pixelwise_mul_hidden{};
     NEGEMMLowpOutputStage            _hidden_outstage{};
     NEGEMMLowpMatrixMultiplyCore     _mm_projection{};
     NEGEMMLowpOutputStage            _projection_outstage{};
-    NEArithmeticAdditionKernel       _accumulate_projection{};
+    NEArithmeticAddition             _accumulate_projection{};
     NEActivationLayer                _projection_clip{};
 
     TensorCopyKernel _projection_bias_copy{};
@@ -311,7 +311,10 @@
     NECopyKernel _copy_output{};
 
     // Tensor pointers
-    const ITensor *_input_to_input_weights{ nullptr };
+    const ITensor *_input_to_input_weights
+    {
+        nullptr
+    };
     const ITensor *_recurrent_to_input_weights{ nullptr };
     const ITensor *_projection_bias{ nullptr };
     const ITensor *_input_to_forget_weights{ nullptr };
@@ -370,7 +373,10 @@
     {
         // Output quantization scale will be different, but ignored here
         // since it will be configured at configure() stage.
-        const TensorInfo out{ in };
+        const TensorInfo out
+        {
+            in
+        };
         return NEQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias);
     }
 
diff --git a/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h b/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
index fc317be..266b3df 100644
--- a/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NERNNLayer.h b/arm_compute/runtime/NEON/functions/NERNNLayer.h
index 0bfb905..12e3ef9 100644
--- a/arm_compute/runtime/NEON/functions/NERNNLayer.h
+++ b/arm_compute/runtime/NEON/functions/NERNNLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,11 +24,11 @@
 #ifndef ARM_COMPUTE_NERNNLAYER_H
 #define ARM_COMPUTE_NERNNLAYER_H
 
-#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
 #include "arm_compute/core/NEON/kernels/NECopyKernel.h"
 
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
 #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMM.h"
 
@@ -82,16 +82,16 @@
     void prepare() override;
 
 private:
-    MemoryGroup                _memory_group;
-    NEGEMM                     _gemm_state_f;
-    NEArithmeticAdditionKernel _add_kernel;
-    NEActivationLayerKernel    _activation_kernel;
-    NEFullyConnectedLayer      _fully_connected;
-    NECopyKernel               _copy_kernel;
-    Tensor                     _fully_connected_out;
-    Tensor                     _gemm_output;
-    Tensor                     _add_output;
-    bool                       _is_prepared;
+    MemoryGroup           _memory_group;
+    NEGEMM                _gemm_state_f;
+    NEArithmeticAddition  _add_f;
+    NEActivationLayer     _activation;
+    NEFullyConnectedLayer _fully_connected;
+    NECopyKernel          _copy_kernel;
+    Tensor                _fully_connected_out;
+    Tensor                _gemm_output;
+    Tensor                _add_output;
+    bool                  _is_prepared;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NERNNLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h b/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h
index 7a50496..3e8db55 100644
--- a/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,6 @@
 #ifndef ARM_COMPUTE_NEROIALIGNLAYER_H
 #define ARM_COMPUTE_NEROIALIGNLAYER_H
 
-#include "arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h"
 #include "arm_compute/runtime/NEON/INESimpleFunction.h"
 
 namespace arm_compute
diff --git a/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h
index 887b571..08885d0 100644
--- a/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NERange.h b/arm_compute/runtime/NEON/functions/NERange.h
index 83ca625..04889d4 100644
--- a/arm_compute/runtime/NEON/functions/NERange.h
+++ b/arm_compute/runtime/NEON/functions/NERange.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEReduceMean.h b/arm_compute/runtime/NEON/functions/NEReduceMean.h
index 3c7cc21..a1b6e34 100644
--- a/arm_compute/runtime/NEON/functions/NEReduceMean.h
+++ b/arm_compute/runtime/NEON/functions/NEReduceMean.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEReductionOperation.h b/arm_compute/runtime/NEON/functions/NEReductionOperation.h
index abda415..ab6928b 100644
--- a/arm_compute/runtime/NEON/functions/NEReductionOperation.h
+++ b/arm_compute/runtime/NEON/functions/NEReductionOperation.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,8 +28,7 @@
 
 #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
 #include "arm_compute/core/NEON/kernels/NEReductionOperationKernel.h"
-#include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
-#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 
 namespace arm_compute
@@ -49,17 +48,17 @@
     NEReductionOperation(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
     /** Set the input and output tensors.
      *
-     * @param[in]  input     Source tensor. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32. Data layouts supported: NCHW. (Written to only for border_size != 0)
-     * @param[out] output    Destination tensor. Data types and data layouts supported: same as @p input.
-     * @param[in]  axis      Dimension along which to reduce. Supported reduction axis : 0
-     * @param[in]  op        Reduction operation to perform.
-     * @param[in]  keep_dims (Optional) Whether to keep the reduced dimension after the operation. Defaults to true.
+     * @param[in, out] input     Source tensor. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32/S32. Data layouts supported: NCHW. (Written to only for border_size != 0)
+     * @param[out]     output    Destination tensor. Data types and data layouts supported: same as @p input.
+     * @param[in]      axis      Dimension along which to reduce. Supported reduction axis : 0
+     * @param[in]      op        Reduction operation to perform.
+     * @param[in]      keep_dims (Optional) Whether to keep the reduced dimension after the operation. Defaults to true.
      */
     void configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims = true);
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEReductionOperation.
      *
-     * @param[in] input     Source tensor info. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32. Data layouts supported: NCHW. (Written to only for border_size != 0)
+     * @param[in] input     Source tensor info. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32/S32. Data layouts supported: NCHW.
      * @param[in] output    Destination tensor info. Data types and data layouts supported: same as @p input.
      * @param[in] axis      Dimension along which to reduce. Supported reduction axis : 0
      * @param[in] op        Reduction operation to perform.
@@ -76,7 +75,7 @@
     MemoryGroup                _memory_group;
     NEReductionOperationKernel _reduction_kernel;
     NEFillBorderKernel         _fill_border_kernel;
-    NEReshapeLayerKernel       _reshape_kernel;
+    NEReshapeLayer             _reshape;
     Tensor                     _output_internal;
     size_t                     _window_split;
     int                        _reduction_axis;
diff --git a/arm_compute/runtime/NEON/functions/NERemap.h b/arm_compute/runtime/NEON/functions/NERemap.h
index 05a7a8f..f087bd2 100644
--- a/arm_compute/runtime/NEON/functions/NERemap.h
+++ b/arm_compute/runtime/NEON/functions/NERemap.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEReorgLayer.h b/arm_compute/runtime/NEON/functions/NEReorgLayer.h
index 8ef7f8a..19385e1 100644
--- a/arm_compute/runtime/NEON/functions/NEReorgLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEReorgLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEReshapeLayer.h b/arm_compute/runtime/NEON/functions/NEReshapeLayer.h
index d664384..2ca6660 100644
--- a/arm_compute/runtime/NEON/functions/NEReshapeLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEReshapeLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,8 +24,11 @@
 #ifndef ARM_COMPUTE_NERESHAPELAYER_H
 #define ARM_COMPUTE_NERESHAPELAYER_H
 
+#include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/INEOperator.h"
+#include "arm_compute/runtime/Types.h"
 
 namespace arm_compute
 {
@@ -33,24 +36,67 @@
 class ITensor;
 
 /** Basic function to run @ref NEReshapeLayerKernel */
-class NEReshapeLayer : public INESimpleFunctionNoBorder
+class NEReshapeLayer : public IFunction
 {
 public:
+    /** Default Constructor */
+    NEReshapeLayer();
+    /** Default Destructor */
+    ~NEReshapeLayer();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEReshapeLayer(const NEReshapeLayer &) = delete;
+    /** Default move constructor */
+    NEReshapeLayer(NEReshapeLayer &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEReshapeLayer &operator=(const NEReshapeLayer &) = delete;
+    /** Default move assignment operator */
+    NEReshapeLayer &operator=(NEReshapeLayer &&);
     /** Initialise the kernel's inputs and outputs
      *
-     * @param[in]  input  First tensor input. Data type supported: All
+     * @param[in]  input  Input tensor. Data type supported: All
      * @param[out] output Output tensor. Data type supported: Same as @p input
      */
     void configure(const ITensor *input, ITensor *output);
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEReshapeLayer
      *
-     * @param[in] input  First tensor info. Data type supported: All
+     * @param[in] input  Input tensor info. Data type supported: All
+     * @param[in] output Output tensor info. Data type supported: Same as @p input
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
+};
+
+namespace experimental
+{
+/** Basic function to run @ref NEReshapeLayerKernel */
+class NEReshape : public INEOperator
+{
+public:
+    /** Initialise the kernel's inputs and outputs
+     *
+     * @param[in]  input  Input tensor info. Data type supported: All
+     * @param[out] output Output info. Data type supported: Same as @p input
+     */
+    void configure(const ITensorInfo *input, ITensorInfo *output);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref NEReshapeLayer
+     *
+     * @param[in] input  Input tensor info. Data type supported: All
      * @param[in] output Output tensor info. Data type supported: Same as @p input
      *
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *output);
 };
+} // namespace experimental
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NERESHAPELAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEReverse.h b/arm_compute/runtime/NEON/functions/NEReverse.h
index ab5a5d0..7a4566d 100644
--- a/arm_compute/runtime/NEON/functions/NEReverse.h
+++ b/arm_compute/runtime/NEON/functions/NEReverse.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEScale.h b/arm_compute/runtime/NEON/functions/NEScale.h
index 75acb96..f149e3b 100644
--- a/arm_compute/runtime/NEON/functions/NEScale.h
+++ b/arm_compute/runtime/NEON/functions/NEScale.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,7 +47,7 @@
     NEScale();
     /** Initialize the function's source, destination, interpolation type and border_mode.
      *
-     * @param[in, out] input                 Source tensor. Data type supported: U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
+     * @param[in, out] input                 Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
      * @param[out]     output                Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
      * @param[in]      policy                The interpolation type.
      * @param[in]      border_mode           Strategy to use for borders.
@@ -56,11 +56,19 @@
      * @param[in]      use_padding           (Optional) Is padding in use or not. Defaults to true.
      * @param[in]      align_corners         (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
      */
+    ARM_COMPUTE_DEPRECATED_REL(20.08)
     void configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue(),
                    SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool use_padding = true, bool align_corners = false);
+    /** Initialize the function's source, destination, interpolation type and border_mode.
+     *
+     * @param[in, out] input  Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]     output Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in]      info   @ref ScaleKernelInfo to be used for configuration
+     */
+    void configure(ITensor *input, ITensor *output, const ScaleKernelInfo &info);
     /** Static function to check if given info will lead to a valid configuration of @ref NEScale
      *
-     * @param[in] input                 Source tensor. Data type supported: U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
+     * @param[in] input                 Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
      * @param[in] output                Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
      * @param[in] policy                The interpolation type.
      * @param[in] border_mode           Strategy to use for borders.
@@ -71,8 +79,18 @@
      *
      * @return a status
      */
+    ARM_COMPUTE_DEPRECATED_REL(20.08)
     static Status validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy, BorderMode border_mode,
                            PixelValue constant_border_value = PixelValue(), SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool use_padding = true, bool align_corners = false);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEScale
+     *
+     * @param[in] input  Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
+     * @param[in] output Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in] info   @ref ScaleKernelInfo to be used for validation
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ScaleKernelInfo &info);
 
     // Inherited methods overridden:
     void run() override;
@@ -84,7 +102,6 @@
     NEScaleKernel      _scale_kernel;   /**< Kernel to perform the scaling */
     NEFillBorderKernel _border_handler; /**< kernel to handle tensor borders */
     bool               _use_padding;    /**< Is padding used on the tensors */
-    bool               _align_corners;  /**< Align corners of input and output */
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NESCALEIMAGE_H */
diff --git a/arm_compute/runtime/NEON/functions/NEScharr3x3.h b/arm_compute/runtime/NEON/functions/NEScharr3x3.h
index 6091121..0113104 100644
--- a/arm_compute/runtime/NEON/functions/NEScharr3x3.h
+++ b/arm_compute/runtime/NEON/functions/NEScharr3x3.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NESelect.h b/arm_compute/runtime/NEON/functions/NESelect.h
index 6ac3280..258ac5d 100644
--- a/arm_compute/runtime/NEON/functions/NESelect.h
+++ b/arm_compute/runtime/NEON/functions/NESelect.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h b/arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h
index a162b65..a814802 100644
--- a/arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h
+++ b/arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NESlice.h b/arm_compute/runtime/NEON/functions/NESlice.h
index 834ec27..2862877 100644
--- a/arm_compute/runtime/NEON/functions/NESlice.h
+++ b/arm_compute/runtime/NEON/functions/NESlice.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,15 +24,18 @@
 #ifndef ARM_COMPUTE_NE_SLICE_H
 #define ARM_COMPUTE_NE_SLICE_H
 
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/INEOperator.h"
 
 namespace arm_compute
 {
 // Forward Declarations
 class ITensor;
 
+namespace experimental
+{
 /** Basic function to perform tensor slicing */
-class NESlice : public INESimpleFunctionNoBorder
+class NESlice : public INEOperator
 {
 public:
     /** Configure kernel
@@ -42,6 +45,55 @@
      * @note End coordinates can be negative, which represents the number of elements before the end of that dimension.
      * @note End indices are not inclusive unless negative.
      *
+     * @param[in]  input  Source tensor info. Data type supported: All
+     * @param[out] output Destination tensor info. Data type supported: Same as @p input
+     * @param[in]  starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in]  ends   The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     */
+    void configure(const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref NESlice
+     *
+     * @note Supported tensor rank: up to 4
+     * @note Start indices must be non-negative. 0 <= starts[i]
+     * @note End coordinates can be negative, which represents the number of elements before the end of that dimension.
+     * @note End indices are not inclusive unless negative.
+     *
+     * @param[in] input  Source tensor info. Data type supported: All
+     * @param[in] output Destination tensor info. Data type supported: Same as @p input
+     * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in] ends   The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     *
+     * @return A status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
+};
+} // namespace experimental
+
+/** Basic function to perform tensor slicing */
+class NESlice : public IFunction
+{
+public:
+    /** Default Constructor */
+    NESlice();
+    /** Default Destructor */
+    ~NESlice();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NESlice(const NESlice &) = delete;
+    /** Default move constructor */
+    NESlice(NESlice &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NESlice &operator=(const NESlice &) = delete;
+    /** Default move assignment operator */
+    NESlice &operator=(NESlice &&);
+
+    /** Configure kernel
+     *
+     * @note Supported tensor rank: up to 4
+     * @note Start indices must be non-negative. 0 <= starts[i]
+     * @note End coordinates can be negative, which represents the number of elements before the end of that dimension.
+     * @note End indices are not inclusive unless negative.
+     *
      * @param[in]  input  Source tensor. Data type supported: All
      * @param[out] output Destination tensor. Data type supported: Same as @p input
      * @param[in]  starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
@@ -64,6 +116,13 @@
      * @return A status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NE_SLICE_H */
diff --git a/arm_compute/runtime/NEON/functions/NESobel3x3.h b/arm_compute/runtime/NEON/functions/NESobel3x3.h
index 0cd633e..4dbdfd2 100644
--- a/arm_compute/runtime/NEON/functions/NESobel3x3.h
+++ b/arm_compute/runtime/NEON/functions/NESobel3x3.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NESobel5x5.h b/arm_compute/runtime/NEON/functions/NESobel5x5.h
index af52292..b5365bc 100644
--- a/arm_compute/runtime/NEON/functions/NESobel5x5.h
+++ b/arm_compute/runtime/NEON/functions/NESobel5x5.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NESobel7x7.h b/arm_compute/runtime/NEON/functions/NESobel7x7.h
index e909888..925444d 100644
--- a/arm_compute/runtime/NEON/functions/NESobel7x7.h
+++ b/arm_compute/runtime/NEON/functions/NESobel7x7.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
index b80ceaf..9fb4d85 100644
--- a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
+++ b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,11 +25,11 @@
 #define ARM_COMPUTE_NESOFTMAXLAYER_H
 
 #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
 #include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
 #include "arm_compute/runtime/Tensor.h"
 
 namespace arm_compute
@@ -39,10 +39,10 @@
 /** Basic function to compute a SoftmaxLayer and a Log SoftmaxLayer.
  *
  * Softmax is calculated by :
- * @f[ out = \frac{e^{x - max(x)}}{\sum{e^{x - max(x)}}} @f]
+ * @f[ out = \frac{e^{(x - max(x)) \cdot beta}}{\sum{e^{(x - max(x)) \cdot beta}}} @f]
  *
  * Log Softmax is calculated by :
- * @f[ out = (x - max(x)) - \sum{e^{x - max(x)}} @f]
+ * @f[ out = ((x - max(x)) \cdot beta) - log(\sum{e^{(x - max(x)) \cdot beta}}) @f]
  *
  * This function runs the following kernels:
  * -# @ref NEFillBorderKernel
@@ -70,25 +70,19 @@
      *                       last value of each row to the nearest multiple.
      * @param[out]    output Destination tensor. Data types supported: same as @p input.
      * @param[in]     beta   (Optional) A scaling factor for the exponent.
-     * @param[in]     axis   (Optional) Reduction axis. Defaults to -1.
-     *                       Negative index is used to specify axis from the end (e.g. -1 for the last axis).Must be in range [-input_num_dimensions, input_num_dimensions).
-     *                       It has the purpose of squashing the first @p axis dimensions together. For instance, given a [4x4x4x4] image,
-     *                       when @p axis is 2, the Softmax reduction will be applied on each of the [4x4] planes of the input image.
+     * @param[in]     axis   (Optional) The last axis of the first n dimensions (inclusive) to reduce. Only supports axis 0.
      */
-    void configure(ITensor *input, ITensor *output, float beta = 1.0f, int32_t axis = -1);
+    void configure(ITensor *input, ITensor *output, float beta = 1.0f, int32_t axis = 0);
     /** Static function to check if given info will lead to a valid configuration of @ref NESoftmaxLayer
      *
      * @param[in] input  Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
      * @param[in] output Destination tensor info. Data types supported: same as @p input
      * @param[in] beta   (Optional) A scaling factor for the exponent.
-     * @param[in] axis   (Optional) Reduction axis. Defaults to -1.
-     *                   Negative index is used to specify axis from the end (e.g. -1 for the last axis).Must be in range [-input_num_dimensions, input_num_dimensions).
-     *                   It has the purpose of squashing the first @p axis dimensions together. For instance, given a [4x4x4x4] image,
-     *                   when @p axis is 2, the Softmax reduction will be applied on each of the [4x4] planes of the input image.
+     * @param[in] axis   (Optional) The last axis of the first n dimensions (inclusive) to reduce. Only supports axis 0.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, float beta = 1.0f, int32_t axis = -1);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, float beta = 1.0f, int32_t axis = 0);
 
     // Inherited methods overridden:
     void run() override;
@@ -103,19 +97,16 @@
      *
      * @param[in] input  Original source tensor.
      * @param[in] output Original destination tensor.
-     * @param[in] axis   (Optional) Reduction axis. Defaults to -1.
-     *                   Negative index is used to specify axis from the end (e.g. -1 for the last axis).Must be in range [-input_num_dimensions, input_num_dimensions).
-     *                   It has the purpose of squashing the first @p axis dimensions together. For instance, given a [4x4x4x4] image,
-     *                   when @p axis is 2, the Softmax reduction will be applied on each of the [4x4] planes of the input image.
+     * @param[in] axis   (Optional) The last axis of the first n dimensions (inclusive) to reduce. Only supports axis 0.
      */
     void configure_reshape_input_kernel(const ITensor *input, const ITensor *output, int32_t axis);
 
     MemoryGroup                     _memory_group;
     NELogits1DMaxKernel             _max_kernel;
     NELogits1DSoftmaxKernel<IS_LOG> _softmax_kernel;
-    std::unique_ptr<INEKernel>      _flat_or_reshape_kernel_ptr;
+    std::unique_ptr<IFunction>      _flat_or_reshape_ptr;
     NEFillBorderKernel              _fill_border_kernel;
-    NEReshapeLayerKernel            _reshape_kernel;
+    NEReshapeLayer                  _reshape;
     Tensor                          _max;
     Tensor                          _tmp;
     Tensor                          _input_flattened;
diff --git a/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h b/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h
index 75fa50c..6f339e8 100644
--- a/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h
+++ b/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h b/arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h
index 6a7a9c8..16a9c80 100644
--- a/arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h
+++ b/arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NESplit.h b/arm_compute/runtime/NEON/functions/NESplit.h
index 69aef79..ede5ecf 100644
--- a/arm_compute/runtime/NEON/functions/NESplit.h
+++ b/arm_compute/runtime/NEON/functions/NESplit.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEStackLayer.h b/arm_compute/runtime/NEON/functions/NEStackLayer.h
index 9288035..4180b6d 100644
--- a/arm_compute/runtime/NEON/functions/NEStackLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEStackLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEStridedSlice.h b/arm_compute/runtime/NEON/functions/NEStridedSlice.h
index 6d5e639..f9c94f5 100644
--- a/arm_compute/runtime/NEON/functions/NEStridedSlice.h
+++ b/arm_compute/runtime/NEON/functions/NEStridedSlice.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,21 +24,79 @@
 #ifndef ARM_COMPUTE_NE_STRIDED_SLICE_H
 #define ARM_COMPUTE_NE_STRIDED_SLICE_H
 
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/INEOperator.h"
 
 namespace arm_compute
 {
 // Forward Declarations
 class ITensor;
 
+namespace experimental
+{
 /** Basic function to run @ref NEStridedSliceKernel */
-class NEStridedSlice : public INESimpleFunction
+class NEStridedSlice : public INEOperator
 {
 public:
     /** Configure kernel
      *
      * @note Supported tensor rank: up to 4
      *
+     * @param[in]  input            Source tensor info. Data type supported: All
+     * @param[out] output           Destination tensor info. Data type supported: Same as @p input
+     * @param[in]  starts           The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in]  ends             The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in]  strides          The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in]  begin_mask       (Optional) If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
+     * @param[in]  end_mask         (Optional) If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
+     * @param[in]  shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
+     *                              A slice of size 1 starting from starts[i] in the dimension must be preserved.
+     */
+    void configure(const ITensorInfo *input, ITensorInfo *output,
+                   const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                   int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref NEStridedSlice
+     *
+     * @note Supported tensor rank: up to 4
+     *
+     * @param[in] input            Source tensor info. Data type supported: All
+     * @param[in] output           Destination tensor info. Data type supported: Same as @p input
+     * @param[in] starts           The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in] ends             The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in] strides          The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+     * @param[in] begin_mask       (Optional) If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
+     * @param[in] end_mask         (Optional) If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
+     * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
+     *                             A slice of size 1 starting from starts[i] in the dimension must be preserved.
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+                           const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                           int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+};
+} // namespace experimental
+
+/** Basic function to run @ref NEStridedSliceKernel */
+class NEStridedSlice : public IFunction
+{
+public:
+    /** Default Constructor */
+    NEStridedSlice();
+    /** Default Destructor */
+    ~NEStridedSlice();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEStridedSlice(const NEStridedSlice &) = delete;
+    /** Default move constructor */
+    NEStridedSlice(NEStridedSlice &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEStridedSlice &operator=(const NEStridedSlice &) = delete;
+    /** Default move assignment operator */
+    NEStridedSlice &operator=(NEStridedSlice &&);
+
+    /** Configure kernel
+     *
+     * @note Supported tensor rank: up to 4
+     *
      * @param[in]  input            Source tensor. Data type supported: All
      * @param[out] output           Destination tensor. Data type supported: Same as @p input
      * @param[in]  starts           The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
@@ -70,6 +128,13 @@
     static Status validate(const ITensorInfo *input, const ITensorInfo *output,
                            const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
                            int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NE_STRIDED_SLICE_H */
diff --git a/arm_compute/runtime/NEON/functions/NETableLookup.h b/arm_compute/runtime/NEON/functions/NETableLookup.h
index b0685af..fb08274 100644
--- a/arm_compute/runtime/NEON/functions/NETableLookup.h
+++ b/arm_compute/runtime/NEON/functions/NETableLookup.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEThreshold.h b/arm_compute/runtime/NEON/functions/NEThreshold.h
index c955283..cb9b696 100644
--- a/arm_compute/runtime/NEON/functions/NEThreshold.h
+++ b/arm_compute/runtime/NEON/functions/NEThreshold.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #ifndef ARM_COMPUTE_NETHRESHOLD_H
 #define ARM_COMPUTE_NETHRESHOLD_H
 
+#include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
 
@@ -31,6 +32,7 @@
 
 namespace arm_compute
 {
+// Forward declarations
 class ITensor;
 
 /** Basic function to run @ref NEThresholdKernel */
@@ -47,8 +49,25 @@
      * @param[in]  type        Thresholding type. Can either be BINARY or RANGE.
      * @param[in]  upper       Upper threshold. Only used with RANGE thresholding
      */
+    ARM_COMPUTE_DEPRECATED_REL(20.08)
     void configure(const ITensor *input, ITensor *output, uint8_t threshold, uint8_t false_value = 0, uint8_t true_value = 0,
                    ThresholdType type = ThresholdType::BINARY, uint8_t upper = 0);
+    /** Initialise the function's source, destination, thresholds and threshold type
+     *
+     * @param[in]  input  First tensor input. Data type supported: U8.
+     * @param[out] output Output tensor. Data type supported: U8.
+     * @param[in]  info   Threshold descriptor
+     */
+    void configure(const ITensor *input, ITensor *output, const ThresholdKernelInfo &info);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEThreshold
+     *
+     * @param[in] input  First tensor input. Data type supported: U8.
+     * @param[in] output Output tensor. Data type supported: U8.
+     * @param[in] info   Threshold descriptor.
+     *
+     * @return A status, containing an error code in case of failure
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ThresholdKernelInfo &info);
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NETHRESHOLD_H */
diff --git a/arm_compute/runtime/NEON/functions/NETile.h b/arm_compute/runtime/NEON/functions/NETile.h
index 14d4f22..53a94db 100644
--- a/arm_compute/runtime/NEON/functions/NETile.h
+++ b/arm_compute/runtime/NEON/functions/NETile.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NETranspose.h b/arm_compute/runtime/NEON/functions/NETranspose.h
index 03c90e5..1169459 100644
--- a/arm_compute/runtime/NEON/functions/NETranspose.h
+++ b/arm_compute/runtime/NEON/functions/NETranspose.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEUnstack.h b/arm_compute/runtime/NEON/functions/NEUnstack.h
index dbb04f0..2e3a679 100644
--- a/arm_compute/runtime/NEON/functions/NEUnstack.h
+++ b/arm_compute/runtime/NEON/functions/NEUnstack.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,8 +47,8 @@
     NEUnstack();
     /** Set the input, output and unstacking axis.
      *
-     * @param[in]     input         A tensor to be unstacked. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
-     * @param[in,out] output_vector A vector of tensors. Data types supported: Same as @p input.
+     * @param[in]     input         A tensor to be unstacked. Data type supported: All.
+     * @param[in,out] output_vector A vector of tensors. Data types supported: same as @p input.
      *                              Note: The number of elements of the vector will be used as the number of slices to be taken from the axis.
      * @param[in]     axis          The axis to unstack along. Valid values are [-R,R) where R is the input's rank. Negative values wrap around.
      *
@@ -56,8 +56,8 @@
     void configure(const ITensor *input, const std::vector<ITensor *> &output_vector, int axis);
     /** Static function to check if given info will lead to a valid configuration of @ref NEUnstack
      *
-     * @param[in] input         Input tensor info. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
-     * @param[in] output_vector Vector of output tensors' info. Data types supported: Same as @p input.
+     * @param[in] input         Input tensor info. Data type supported: All.
+     * @param[in] output_vector Vector of output tensors' info. Data types supported: same as @p input.
      * @param[in] axis          The axis to unstack along. Valid values are [-R,R) where R is the input's rank. Negative values wrap around.
      *
      * @return a status
diff --git a/arm_compute/runtime/NEON/functions/NEUpsampleLayer.h b/arm_compute/runtime/NEON/functions/NEUpsampleLayer.h
index ff465e5..f9145f1 100644
--- a/arm_compute/runtime/NEON/functions/NEUpsampleLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEUpsampleLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEWarpAffine.h b/arm_compute/runtime/NEON/functions/NEWarpAffine.h
index 768ef0c..eb7492b 100644
--- a/arm_compute/runtime/NEON/functions/NEWarpAffine.h
+++ b/arm_compute/runtime/NEON/functions/NEWarpAffine.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEWarpPerspective.h b/arm_compute/runtime/NEON/functions/NEWarpPerspective.h
index 66fb9ac..c439e82 100644
--- a/arm_compute/runtime/NEON/functions/NEWarpPerspective.h
+++ b/arm_compute/runtime/NEON/functions/NEWarpPerspective.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
index 4a8fe61..4090c8c 100644
--- a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/NEYOLOLayer.h b/arm_compute/runtime/NEON/functions/NEYOLOLayer.h
index 5e0c34b..8821960 100644
--- a/arm_compute/runtime/NEON/functions/NEYOLOLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEYOLOLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h b/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h
index f16bb46..7f63717 100644
--- a/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h
+++ b/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/OMP/OMPScheduler.h b/arm_compute/runtime/OMP/OMPScheduler.h
index ed00833..56a31cc 100644
--- a/arm_compute/runtime/OMP/OMPScheduler.h
+++ b/arm_compute/runtime/OMP/OMPScheduler.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -55,6 +55,18 @@
      */
     void schedule(ICPPKernel *kernel, const Hints &hints) override;
 
+    /** Multithread the execution of the passed kernel if possible.
+     *
+     * The kernel will run on a single thread if any of these conditions is true:
+     * - ICPPKernel::is_parallelisable() returns false
+     * - The scheduler has been initialized with only one thread.
+     *
+     * @param[in] kernel  Kernel to execute.
+     * @param[in] hints   Hints for the scheduler.
+     * @param[in] tensors Vector containing the tensors to operate on.
+     */
+    void schedule_op(ICPPKernel *kernel, const Hints &hints, ITensorPack &tensors) override;
+
 protected:
     /** Execute all the passed workloads
      *
diff --git a/arm_compute/runtime/OffsetLifetimeManager.h b/arm_compute/runtime/OffsetLifetimeManager.h
index 26ade01..2eef61a 100644
--- a/arm_compute/runtime/OffsetLifetimeManager.h
+++ b/arm_compute/runtime/OffsetLifetimeManager.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/OffsetMemoryPool.h b/arm_compute/runtime/OffsetMemoryPool.h
index dff3f23..a5c363d 100644
--- a/arm_compute/runtime/OffsetMemoryPool.h
+++ b/arm_compute/runtime/OffsetMemoryPool.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/OperatorTensor.h b/arm_compute/runtime/OperatorTensor.h
new file mode 100644
index 0000000..92ae019
--- /dev/null
+++ b/arm_compute/runtime/OperatorTensor.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_OPERATORTENSOR_H
+#define ARM_COMPUTE_OPERATORTENSOR_H
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/Types.h"
+#include "arm_compute/runtime/experimental/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class TensorInfo;
+class IRuntimeContext;
+class IMemory;
+namespace experimental
+{
+/** Basic implementation of the tensor interface */
+class OperatorTensor : public ITensor
+{
+public:
+    /** Constructor
+     *
+     * @param[in] info   Pointer to the tensor info.
+     * @param[in] memory Pointer to the memory info.
+     *
+     */
+    OperatorTensor(ITensorInfo *info, IMemory *memory);
+    /** Destructor: free the tensor's memory */
+    ~OperatorTensor() = default;
+    /** Allow instances of this class to be move constructed */
+    OperatorTensor(OperatorTensor &&) = default;
+    /** Allow instances of this class to be moved */
+    OperatorTensor &operator=(OperatorTensor &&) = default;
+    /** Prevent instances of this class to be copy assigned */
+    OperatorTensor &operator=(const OperatorTensor &) = delete;
+    /** Prevent instances of this class to be copy constructed */
+    OperatorTensor(const OperatorTensor &) = delete;
+
+    // Inherited methods overridden:
+    arm_compute::ITensorInfo *info() const override;
+    arm_compute::ITensorInfo *info() override;
+    uint8_t                  *buffer() const override;
+
+private:
+    arm_compute::ITensorInfo *_info;
+    arm_compute::IMemory     *_memory;
+    MemoryType                _mem_type;
+};
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_OPERATORTENSOR_H */
diff --git a/arm_compute/runtime/PoolManager.h b/arm_compute/runtime/PoolManager.h
index 2ee00bc..cc50fc0 100644
--- a/arm_compute/runtime/PoolManager.h
+++ b/arm_compute/runtime/PoolManager.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/Pyramid.h b/arm_compute/runtime/Pyramid.h
index 3dc7259..6e6feca 100644
--- a/arm_compute/runtime/Pyramid.h
+++ b/arm_compute/runtime/Pyramid.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/RuntimeContext.h b/arm_compute/runtime/RuntimeContext.h
index 361d601..31e2d69 100644
--- a/arm_compute/runtime/RuntimeContext.h
+++ b/arm_compute/runtime/RuntimeContext.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/Scheduler.h b/arm_compute/runtime/Scheduler.h
index f8e9580..9e8add1 100644
--- a/arm_compute/runtime/Scheduler.h
+++ b/arm_compute/runtime/Scheduler.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/SchedulerFactory.h b/arm_compute/runtime/SchedulerFactory.h
index 5c24857..647e486 100644
--- a/arm_compute/runtime/SchedulerFactory.h
+++ b/arm_compute/runtime/SchedulerFactory.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/SingleThreadScheduler.h b/arm_compute/runtime/SingleThreadScheduler.h
index 3f279eb..d45730e 100644
--- a/arm_compute/runtime/SingleThreadScheduler.h
+++ b/arm_compute/runtime/SingleThreadScheduler.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,6 +50,13 @@
      * @param[in] hints  Hints for the scheduler.
      */
     void schedule(ICPPKernel *kernel, const Hints &hints) override;
+    /** Runs the kernel in the same thread as the caller synchronously.
+     *
+     * @param[in] kernel  Kernel to execute.
+     * @param[in] hints   Hints for the scheduler.
+     * @param[in] tensors Vector containing the tensors to operate on.
+     */
+    void schedule_op(ICPPKernel *kernel, const Hints &hints, ITensorPack &tensors) override;
 
 protected:
     /** Will run the workloads sequentially and in order.
diff --git a/arm_compute/runtime/SubTensor.h b/arm_compute/runtime/SubTensor.h
index b296877..3ca066e 100644
--- a/arm_compute/runtime/SubTensor.h
+++ b/arm_compute/runtime/SubTensor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/Tensor.h b/arm_compute/runtime/Tensor.h
index 71ed294..172c896 100644
--- a/arm_compute/runtime/Tensor.h
+++ b/arm_compute/runtime/Tensor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/TensorAllocator.h b/arm_compute/runtime/TensorAllocator.h
index c0c6f2a..a5e16c4 100644
--- a/arm_compute/runtime/TensorAllocator.h
+++ b/arm_compute/runtime/TensorAllocator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/Types.h b/arm_compute/runtime/Types.h
index 1b6e1bd..f5b7b06 100644
--- a/arm_compute/runtime/Types.h
+++ b/arm_compute/runtime/Types.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/Utils.h b/arm_compute/runtime/Utils.h
index 9a5b20e..6e36297 100644
--- a/arm_compute/runtime/Utils.h
+++ b/arm_compute/runtime/Utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/runtime/common/LSTMParams.h b/arm_compute/runtime/common/LSTMParams.h
index 5e4a76a..ffb4ddd 100644
--- a/arm_compute/runtime/common/LSTMParams.h
+++ b/arm_compute/runtime/common/LSTMParams.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -81,7 +81,7 @@
      *
      * @return Reference to this LSTMParams object
      */
-    LSTMParams &set_cifg_params(const T *input_to_input_weights, const T *recurrent_to_input_weights, const T *cell_to_input_weights, const T *input_gate_bias)
+    LSTMParams &set_cifg_params(const T *input_to_input_weights, const T *recurrent_to_input_weights, T *cell_to_input_weights, const T *input_gate_bias)
     {
         _input_to_input_weights     = input_to_input_weights;
         _recurrent_to_input_weights = recurrent_to_input_weights;
@@ -111,7 +111,7 @@
      *
      * @return Reference to this LSTMParams object
      */
-    LSTMParams &set_peephole_params(const T *cell_to_forget_weights, const T *cell_to_output_weights)
+    LSTMParams &set_peephole_params(T *cell_to_forget_weights, T *cell_to_output_weights)
     {
         _cell_to_forget_weights = cell_to_forget_weights;
         _cell_to_output_weights = cell_to_output_weights;
@@ -127,8 +127,8 @@
      *
      * @return Reference to this LSTMParams object
      */
-    LSTMParams &set_layer_normalization_params(const T *input_layer_norm_weights, const T *forget_layer_norm_weights,
-                                               const T *cell_layer_norm_weights, const T *output_layer_norm_weights)
+    LSTMParams &set_layer_normalization_params(T *input_layer_norm_weights, T *forget_layer_norm_weights,
+                                               T *cell_layer_norm_weights, T *output_layer_norm_weights)
     {
         _input_layer_norm_weights  = input_layer_norm_weights;
         _forget_layer_norm_weights = forget_layer_norm_weights;
@@ -204,7 +204,7 @@
         return _recurrent_to_input_weights;
     }
 
-    const T *cell_to_input_weights() const
+    T *cell_to_input_weights() const
     {
         return _cell_to_input_weights;
     }
@@ -214,12 +214,12 @@
         return _input_gate_bias;
     }
 
-    const T *cell_to_forget_weights() const
+    T *cell_to_forget_weights() const
     {
         return _cell_to_forget_weights;
     }
 
-    const T *cell_to_output_weights() const
+    T *cell_to_output_weights() const
     {
         return _cell_to_output_weights;
     }
@@ -234,22 +234,22 @@
         return _projection_bias;
     }
 
-    const T *input_layer_norm_weights() const
+    T *input_layer_norm_weights() const
     {
         return _input_layer_norm_weights;
     }
 
-    const T *forget_layer_norm_weights() const
+    T *forget_layer_norm_weights() const
     {
         return _forget_layer_norm_weights;
     }
 
-    const T *cell_layer_norm_weights() const
+    T *cell_layer_norm_weights() const
     {
         return _cell_layer_norm_weights;
     }
 
-    const T *output_layer_norm_weights() const
+    T *output_layer_norm_weights() const
     {
         return _output_layer_norm_weights;
     }
@@ -317,16 +317,16 @@
 private:
     const T *_input_to_input_weights;
     const T *_recurrent_to_input_weights;
-    const T *_cell_to_input_weights;
+    T       *_cell_to_input_weights;
     const T *_input_gate_bias;
-    const T *_cell_to_forget_weights;
-    const T *_cell_to_output_weights;
+    T       *_cell_to_forget_weights;
+    T       *_cell_to_output_weights;
     const T *_projection_weights;
     const T *_projection_bias;
-    const T *_input_layer_norm_weights;
-    const T *_forget_layer_norm_weights;
-    const T *_cell_layer_norm_weights;
-    const T *_output_layer_norm_weights;
+    T       *_input_layer_norm_weights;
+    T       *_forget_layer_norm_weights;
+    T       *_cell_layer_norm_weights;
+    T       *_output_layer_norm_weights;
     float    _cell_clip;
     float    _projection_clip;
     float    _input_intermediate_scale;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp b/arm_compute/runtime/experimental/Types.h
similarity index 60%
copy from src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
copy to arm_compute/runtime/experimental/Types.h
index 36f84d8..54ea47d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
+++ b/arm_compute/runtime/experimental/Types.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,37 +21,22 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#pragma once
+#ifndef ARM_COMPUTE_RUNTIME_EXPERIMENTAL_TYPES_H
+#define ARM_COMPUTE_RUNTIME_EXPERIMENTAL_TYPES_H
 
-#ifdef __aarch64__
+#include <vector>
 
-namespace arm_gemm {
-
-// Actual kernel implementations
-void a64_sgemv_trans(const float *, const float *, float *, float, int, int, int);
-
-// Transposed SGEMV strategy class.
-class sgemv_trans {
-public:
-    typedef float operand_type;
-    typedef float result_type;
-
-    typedef void (*kern_type)(const float *, const float *, float *, float, int, int, int);
-
-    /* Kernel blocking parameters */
-    static unsigned int out_width() {
-        return 96;
-    }
-
-    static unsigned int k_unroll() {
-        return 1;
-    }
-
-    kern_type kernel=a64_sgemv_trans;
-
-    sgemv_trans(const CPUInfo *ci) { UNUSED(ci); }
+namespace arm_compute
+{
+namespace experimental
+{
+/** Memory type */
+enum class MemoryType
+{
+    CPU,
+    CL,
+    GLES
 };
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_RUNTIME_EXPERIMENTAL_TYPES_H */
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index f5b8825..bb1dfec 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox
@@ -1,5 +1,5 @@
 ///
-/// Copyright (c) 2017-2020 ARM Limited.
+/// Copyright (c) 2017-2020 Arm Limited.
 ///
 /// SPDX-License-Identifier: MIT
 ///
@@ -237,6 +237,77 @@
 
 @subsection S2_2_changelog Changelog
 
+v20.08 Public major release
+ - Various bug fixes.
+ - Various optimisations.
+ - Added new data type QASYMM8_SIGNED support for:
+   - @ref CLArgMinMaxLayer
+   - @ref CLArgMinMaxLayerKernel
+ - Added new data type U8 support for:
+   - @ref NECropKernel
+   - @ref CLCropKernel
+ - Added align_corners support for nearest neighbor interpolation in:
+   - @ref NEScaleKernel
+   - @ref CLScaleKernel
+ - New OpenCL kernels / functions:
+   - @ref CLMaxUnpoolingLayerKernel
+ - New NEON kernels / functions:
+   - @ref NEMaxUnpoolingLayerKernel
+ - New graph example:
+   - graph_yolov3_output_detector
+ - GEMMTuner improvements:
+   - Added fp16 support
+   - Output json files for easier integration
+   - Enabled tuning for export_to_cl_image_rhs option for RHS tensors
+   - More robust script for running benchmarks
+ - Removed padding from:
+   - @ref NEPixelWiseMultiplicationKernel
+   - @ref NEHeightConcatenateLayerKernel
+   - @ref NEThresholdKernel
+   - @ref NEBatchConcatenateLayerKernel
+   - @ref NETransposeKernel
+   - @ref NEBatchNormalizationLayerKernel
+   - @ref NEArithmeticSubtractionKernel
+   - @ref NEBoundingBoxTransformKernel
+   - @ref NELogits1DMaxKernel
+   - @ref NELogits1DSoftmaxKernel
+   - @ref NEROIPoolingLayerKernel
+   - @ref NEROIAlignLayerKernel
+   - @ref NEYOLOLayerKernel
+   - @ref NEUpsampleLayerKernel
+   - @ref NEFloorKernel
+   - @ref NEWidthConcatenateLayerKernel
+   - @ref NEDepthConcatenateLayerKernel
+   - @ref NENormalizationLayerKernel
+   - @ref NEL2NormalizeLayerKernel
+   - @ref NEFillArrayKernel
+   - @ref NEDepthConvertLayerKernel
+   - @ref NERangeKernel
+   - @ref NEPriorBoxLayer
+ - Removed OpenCL kernels / functions:
+   - CLGEMMLowpQuantizeDownInt32ToUint8Scale
+   - CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat
+ - Removed NEON kernels / functions:
+   - NEGEMMLowpQuantizeDownInt32ToUint8Scale
+   - NEGEMMMatrixAccumulateBiasesKernel
+ - Deprecated functions / interfaces:
+   - Non-descriptor based interfaces for @ref NEThreshold, @ref CLThreshold
+   - Non-descriptor based interfaces for @ref NEScale, @ref CLScale and @ref GCScale
+   - In @ref NESoftmaxLayer, @ref NELogSoftmaxLayer, @ref CLSoftmaxLayer, @ref CLLogSoftmaxLayer and @ref GCSoftmaxLayer :
+      The default "axis" value for @ref CLSoftmaxLayer, @ref CLLogSoftmaxLayer and @ref GCSoftmaxLayer is changed from 1 to 0.
+      Only axis 0 is supported.
+      The default "axis" value for @ref NESoftmaxLayer, @ref NELogSoftmaxLayer is changed from 1 to 0.
+      Only axis 0 is supported.
+ - The support for quantized data types has been removed from @ref CLLogSoftmaxLayer due to implementation complexity.
+ - Removed padding requirement for the input (e.g. LHS of GEMM) and output in @ref CLGEMMMatrixMultiplyNativeKernel, @ref CLGEMMMatrixMultiplyReshapedKernel, @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel and @ref CLIm2ColKernel (NHWC only)
+   - This change allows using @ref CLGEMMConvolutionLayer without extra padding for the input and output.
+   - This change allows to use @ref CLGEMMConvolutionLayer without extra padding for the input and output.
+   - Only the weights/bias of @ref CLGEMMConvolutionLayer could require padding for the computation.
+   - Only on Arm Mali Midgard GPUs, @ref CLGEMMConvolutionLayer could require padding since @ref CLGEMMMatrixMultiplyKernel is called and currently requires padding.
+ - Added support for exporting the OpenCL buffer object to the OpenCL image object in @ref CLGEMMMatrixMultiplyReshapedKernel and @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.
+   - This support allows exporting the OpenCL buffer used for the reshaped RHS matrix to the OpenCL image object.
+   - The padding requirement for the OpenCL image object is taken into account in the @ref CLGEMMReshapeRHSMatrixKernel.
+   - The reshaped RHS matrix stores the weights when GEMM is used to accelerate @ref CLGEMMConvolutionLayer.
+
 v20.05 Public major release
  - Various bug fixes.
  - Various optimisations.
@@ -330,7 +401,7 @@
      - @ref NEElementwiseMin
      - @ref NEElementwiseSquaredDiff
      - @ref NEFullyConnectedLayer
-     - @ref NEGEMMMatrixVectorMultiplyKernel
+     - NEGEMMMatrixVectorMultiplyKernel
      - @ref NEPixelWiseMultiplication
      - @ref NEPoolingLayer
      - @ref NEPReluLayer
@@ -902,7 +973,7 @@
  - New NEON kernels / functions
     - arm_compute::NEGEMMLowpAArch64A53Kernel / arm_compute::NEGEMMLowpAArch64Kernel / arm_compute::NEGEMMLowpAArch64V8P4Kernel / arm_compute::NEGEMMInterleavedBlockedKernel / arm_compute::NEGEMMLowpAssemblyMatrixMultiplyCore
     - arm_compute::NEHGEMMAArch64FP16Kernel
-    - @ref NEDepthwiseConvolutionLayer3x3Kernel / NEDepthwiseIm2ColKernel / @ref NEGEMMMatrixVectorMultiplyKernel / NEDepthwiseVectorToTensorKernel / @ref NEDepthwiseConvolutionLayer
+    - @ref NEDepthwiseConvolutionLayer3x3Kernel / NEDepthwiseIm2ColKernel / NEGEMMMatrixVectorMultiplyKernel / NEDepthwiseVectorToTensorKernel / @ref NEDepthwiseConvolutionLayer
     - @ref NEGEMMLowpOffsetContributionKernel / @ref NEGEMMLowpMatrixAReductionKernel / @ref NEGEMMLowpMatrixBReductionKernel / @ref NEGEMMLowpMatrixMultiplyCore
     - @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel / @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
     - NEWinogradLayer / NEWinogradLayerKernel
@@ -1024,14 +1095,14 @@
    - @ref NETransposeKernel / @ref NETranspose
    - @ref NELogits1DMaxKernel, NELogits1DShiftExpSumKernel, NELogits1DNormKernel / @ref NESoftmaxLayer
    - @ref NEIm2ColKernel, @ref NECol2ImKernel, NEConvolutionLayerWeightsReshapeKernel / @ref NEConvolutionLayer
-   - @ref NEGEMMMatrixAccumulateBiasesKernel / @ref NEFullyConnectedLayer
+   - NEGEMMMatrixAccumulateBiasesKernel / @ref NEFullyConnectedLayer
    - @ref NEGEMMLowpMatrixMultiplyKernel / NEGEMMLowp
 
 v17.03 Sources preview
  - New OpenCL kernels / functions:
    - @ref CLGradientKernel, @ref CLEdgeNonMaxSuppressionKernel, @ref CLEdgeTraceKernel / @ref CLCannyEdge
    - GEMM refactoring + FP16 support: CLGEMMInterleave4x4Kernel, CLGEMMTranspose1xWKernel, @ref CLGEMMMatrixMultiplyKernel, CLGEMMMatrixAdditionKernel / @ref CLGEMM
-   - @ref CLGEMMMatrixAccumulateBiasesKernel / @ref CLFullyConnectedLayer
+   - CLGEMMMatrixAccumulateBiasesKernel / @ref CLFullyConnectedLayer
    - @ref CLTransposeKernel / @ref CLTranspose
    - @ref CLLKTrackerInitKernel, @ref CLLKTrackerStage0Kernel, @ref CLLKTrackerStage1Kernel, @ref CLLKTrackerFinalizeKernel / @ref CLOpticalFlow
    - @ref CLNormalizationLayerKernel / @ref CLNormalizationLayer
@@ -1043,7 +1114,7 @@
 
 v17.02.1 Sources preview
  - New OpenCL kernels / functions:
-   - @ref CLLogits1DMaxKernel, @ref CLLogits1DShiftExpSumKernel, @ref CLLogits1DNormKernel / @ref CLSoftmaxLayer
+   - CLLogits1DMaxKernel, CLLogits1DShiftExpSumKernel, @ref CLLogits1DNormKernel / @ref CLSoftmaxLayer
    - @ref CLPoolingLayerKernel / @ref CLPoolingLayer
    - @ref CLIm2ColKernel, @ref CLCol2ImKernel, CLConvolutionLayerWeightsReshapeKernel / @ref CLConvolutionLayer
    - @ref CLRemapKernel / @ref CLRemap
@@ -1327,13 +1398,13 @@
 
 i.e. to natively compile the "graph_lenet" example for Linux 64bit:
 
-	g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++11 L. -larm_compute_graph -larm_compute -larm_compute_core -Wl,--allow-shlib-undefined -o graph_lenet
+	g++ examples/graph_lenet.cpp utils/Utils.cpp utils/GraphUtils.cpp utils/CommonGraphOptions.cpp -I. -Iinclude -std=c++11 -L. -larm_compute_graph -larm_compute -larm_compute_core -Wl,--allow-shlib-undefined -o graph_lenet
 
 (notice the only difference with the 32 bit command is that we don't need the -mfpu option)
 
 @note If compiling using static libraries, this order must be followed when linking: arm_compute_graph_static, arm_compute, arm_compute_core
 
-@note These two commands assume libarm_compute.so is available in your library path, if not add the path to it using -L
+@note These two commands assume libarm_compute.so is available in your library path, if not add the path to it using -L (e.g. -Llib/linux-arm64-v8a-neon-cl-asserts/)
 @note You might need to export the path to OpenCL library as well in your LD_LIBRARY_PATH if Compute Library was built with OpenCL enabled.
 
 To run the built executable simply run:
diff --git a/docs/01_library.dox b/docs/01_library.dox
index d09f928..ea29b75 100644
--- a/docs/01_library.dox
+++ b/docs/01_library.dox
@@ -1,5 +1,5 @@
 ///
-/// Copyright (c) 2017-2020 ARM Limited.
+/// Copyright (c) 2017-2020 Arm Limited.
 ///
 /// SPDX-License-Identifier: MIT
 ///
diff --git a/docs/02_tests.dox b/docs/02_tests.dox
index b636880..a813844 100644
--- a/docs/02_tests.dox
+++ b/docs/02_tests.dox
@@ -1,5 +1,5 @@
 ///
-/// Copyright (c) 2017-2020 ARM Limited.
+/// Copyright (c) 2017-2020 Arm Limited.
 ///
 /// SPDX-License-Identifier: MIT
 ///
diff --git a/docs/03_scripts.dox b/docs/03_scripts.dox
index e6c19e5..efa6fa9 100644
--- a/docs/03_scripts.dox
+++ b/docs/03_scripts.dox
@@ -1,5 +1,5 @@
 ///
-/// Copyright (c) 2017-2019 ARM Limited.
+/// Copyright (c) 2017-2019 Arm Limited.
 ///
 /// SPDX-License-Identifier: MIT
 ///
diff --git a/docs/04_adding_operator.dox b/docs/04_adding_operator.dox
index 66ae9a6..c40aaa3 100644
--- a/docs/04_adding_operator.dox
+++ b/docs/04_adding_operator.dox
@@ -1,5 +1,5 @@
 ///
-/// Copyright (c) 2018-2019 ARM Limited.
+/// Copyright (c) 2018-2019 Arm Limited.
 ///
 /// SPDX-License-Identifier: MIT
 ///
diff --git a/docs/05_contribution_guidelines.dox b/docs/05_contribution_guidelines.dox
index 7c919eb..abe0bc9 100644
--- a/docs/05_contribution_guidelines.dox
+++ b/docs/05_contribution_guidelines.dox
@@ -1,5 +1,5 @@
 ///
-/// Copyright (c) 2019 ARM Limited.
+/// Copyright (c) 2019 Arm Limited.
 ///
 /// SPDX-License-Identifier: MIT
 ///
diff --git a/docs/06_functions_list.dox b/docs/06_functions_list.dox
index 1401ff6..ac94461 100644
--- a/docs/06_functions_list.dox
+++ b/docs/06_functions_list.dox
@@ -1,5 +1,5 @@
 ///
-/// Copyright (c) 2018-2019 ARM Limited.
+/// Copyright (c) 2018-2019 Arm Limited.
 ///
 /// SPDX-License-Identifier: MIT
 ///
@@ -91,7 +91,6 @@
         - @ref NEFullyConnectedLayerReshapeWeights
         - @ref NEGather
         - @ref NEGEMMInterleave4x4
-        - @ref NEGEMMLowpQuantizeDownInt32ToUint8Scale
         - @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
         - @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
         - @ref NEGEMMTranspose1xW
@@ -160,6 +159,7 @@
     - @ref NELSTMLayer
     - @ref NELSTMLayerQuantized
     - @ref NEQLSTMLayer
+    - @ref NEMaxUnpoolingLayer
     - @ref NEMeanStdDev
     - @ref NEMinMaxLocation
     - @ref NENormalizationLayer
@@ -234,6 +234,7 @@
     - @ref CLLSTMLayer
     - @ref CLLSTMLayerQuantized
     - @ref CLQLSTMLayer
+    - @ref CLMaxUnpoolingLayer
     - @ref CLMeanStdDev
     - @ref CLMinMaxLocation
     - @ref CLNormalizationLayer
@@ -297,10 +298,8 @@
         - @ref CLFullyConnectedLayerReshapeWeights
         - @ref CLGather
         - @ref CLGaussian3x3
-        - @ref CLGEMMLowpQuantizeDownInt32ToUint8Scale
         - @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
         - @ref CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint
-        - @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat
         - @ref CLMagnitude
         - @ref CLMeanStdDevNormalizationLayer
         - @ref CLMedian3x3
diff --git a/docs/07_errata.dox b/docs/07_errata.dox
index 16541c5..2d35e67 100644
--- a/docs/07_errata.dox
+++ b/docs/07_errata.dox
@@ -1,5 +1,5 @@
 ///
-/// Copyright (c) 2019 ARM Limited.
+/// Copyright (c) 2019 Arm Limited.
 ///
 /// SPDX-License-Identifier: MIT
 ///
diff --git a/docs/Doxyfile b/docs/Doxyfile
index b701c42..ef8966c 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -38,7 +38,7 @@
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER         = 20.05
+PROJECT_NUMBER         = 20.08
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
diff --git a/examples/SConscript b/examples/SConscript
index 8971d3c..e28761c 100644
--- a/examples/SConscript
+++ b/examples/SConscript
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 ARM Limited.
+# Copyright (c) 2017 Arm Limited.
 #
 # SPDX-License-Identifier: MIT
 #
@@ -91,9 +91,15 @@
     for file in Glob("./gemm_tuner/cl_*.cpp"):
         example = os.path.basename(os.path.splitext(str(file))[0])
         example = os.path.join("gemm_tuner", example)
-        prog = examples_env.Program(example, ["{}.cpp".format(example), utils, gemm_tuner_common_options], CPPDEFINES=['ARM_COMPUTE_CL'], LIBS = examples_libs + arm_compute_libs)
-        Depends(prog, arm_compute_dependency)
-        prog = install_bin(prog)
+        if env['os'] in ['android', 'bare_metal'] or env['standalone']:
+            prog = examples_env.Program(example, ["{}.cpp".format(example), utils, gemm_tuner_common_options], CPPDEFINES=['ARM_COMPUTE_CL'], LIBS = examples_libs + arm_compute_graph_libs, LINKFLAGS=examples_env["LINKFLAGS"]+['-Wl,--whole-archive',graph_dependency,'-Wl,--no-whole-archive', '-fstack-protector-strong'] )
+            Depends(prog, graph_dependency)
+            prog = install_bin(prog)
+        else:
+            #-Wl,--allow-shlib-undefined: Ignore dependencies of dependencies
+            prog = examples_env.Program(example, ["{}.cpp".format(example), utils, gemm_tuner_common_options], CPPDEFINES=['ARM_COMPUTE_CL'], LIBS = examples_libs + arm_compute_graph_libs, LINKFLAGS=examples_env["LINKFLAGS"]+['-Wl,--allow-shlib-undefined'] )
+            Depends(prog, graph_dependency)
+            prog = install_bin(prog)
         alias = examples_env.Alias(example, prog)
         Default(alias)
 
@@ -120,20 +126,3 @@
         prog = install_bin(prog)
         alias = examples_env.Alias(example, prog)
         Default(alias)
-
-#FIXME Delete 3rdparty builds before release
-for file in Glob("#3rdparty/examples/graph_*.cpp"):
-    example = os.path.basename(os.path.splitext(str(file))[0])
-    prog = None
-
-    if env['os'] in ['android', 'bare_metal'] or env['standalone']:
-        prog = examples_env.Program(example, [examples_env.Object(source=file, target=example), utils, graph_utils], LIBS = examples_libs + arm_compute_graph_libs, LINKFLAGS=examples_env["LINKFLAGS"]+['-Wl,--whole-archive',graph_dependency,'-Wl,--no-whole-archive', '-fstack-protector-strong'])
-        Depends(prog, graph_dependency)
-        prog = install_bin(prog)
-    else:
-        #-Wl,--allow-shlib-undefined: Ignore dependencies of dependencies
-        prog = examples_env.Program(example, [examples_env.Object(source=file, target=example), utils, graph_utils], LIBS = examples_libs + arm_compute_graph_libs, LINKFLAGS=examples_env["LINKFLAGS"]+['-Wl,--allow-shlib-undefined'] )
-        Depends(prog, graph_dependency)
-        prog = install_bin(prog)
-    alias = examples_env.Alias(example, prog)
-    Default(alias)
diff --git a/examples/cl_cache.cpp b/examples/cl_cache.cpp
index a1a2d25..37e1c27 100644
--- a/examples/cl_cache.cpp
+++ b/examples/cl_cache.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/cl_convolution.cpp b/examples/cl_convolution.cpp
index f2d19ef..34b3466 100644
--- a/examples/cl_convolution.cpp
+++ b/examples/cl_convolution.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/cl_events.cpp b/examples/cl_events.cpp
index a9c508a..f578180 100644
--- a/examples/cl_events.cpp
+++ b/examples/cl_events.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -67,7 +67,7 @@
         tmp_median_gauss.allocator()->init(dst_info);
 
         //Configure the functions:
-        scale.configure(&src, &tmp_scale_median, InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::REPLICATE);
+        scale.configure(&src, &tmp_scale_median, ScaleKernelInfo{ InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::REPLICATE });
         median.configure(&tmp_scale_median, &tmp_median_gauss, BorderMode::REPLICATE);
         gauss.configure(&tmp_median_gauss, &dst, BorderMode::REPLICATE);
 
diff --git a/examples/cl_sgemm.cpp b/examples/cl_sgemm.cpp
index 8e0263d..7d3b4fe 100644
--- a/examples/cl_sgemm.cpp
+++ b/examples/cl_sgemm.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/gc_absdiff.cpp b/examples/gc_absdiff.cpp
index 6793df0..701e43f 100644
--- a/examples/gc_absdiff.cpp
+++ b/examples/gc_absdiff.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/gc_dc.cpp b/examples/gc_dc.cpp
index 6d09eba..ac235fd 100644
--- a/examples/gc_dc.cpp
+++ b/examples/gc_dc.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/gemm_tuner/CommonGemmExampleOptions.cpp b/examples/gemm_tuner/CommonGemmExampleOptions.cpp
index a93d019..2e15e62 100644
--- a/examples/gemm_tuner/CommonGemmExampleOptions.cpp
+++ b/examples/gemm_tuner/CommonGemmExampleOptions.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,6 +34,7 @@
     os << "N : " << common_params.N << std::endl;
     os << "K : " << common_params.K << std::endl;
     os << "B : " << common_params.B << std::endl;
+    os << "Data type : " << common_params.data_type << std::endl;
     return os;
 }
 
@@ -42,22 +43,33 @@
       M(parser.add_positional_option<SimpleOption<size_t>>("M", 100)),
       N(parser.add_positional_option<SimpleOption<size_t>>("N", 100)),
       K(parser.add_positional_option<SimpleOption<size_t>>("K", 50)),
-      B(parser.add_positional_option<SimpleOption<size_t>>("B", 1))
+      B(parser.add_positional_option<SimpleOption<size_t>>("B", 1)),
+      data_type()
 {
+    const std::set<DataType> supported_data_types
+    {
+        DataType::F16,
+        DataType::F32,
+    };
+
+    data_type = parser.add_option<EnumOption<DataType>>("type", supported_data_types, DataType::F32);
+
     help->set_help("Show this help message.");
     M->set_help("Number of lhs matrix rows.");
     N->set_help("Number of rhs matrix columns.");
     K->set_help("Number of lhs matrix columns/rhs matrix rows.");
     B->set_help("Batch size.");
+    data_type->set_help("Data type to use");
 }
 
 CommonGemmExampleParams consume_common_gemm_example_parameters(const CommonGemmExampleOptions &options)
 {
     CommonGemmExampleParams common_params;
-    common_params.M = options.M->value();
-    common_params.N = options.N->value();
-    common_params.K = options.K->value();
-    common_params.B = options.B->value();
+    common_params.M         = options.M->value();
+    common_params.N         = options.N->value();
+    common_params.K         = options.K->value();
+    common_params.B         = options.B->value();
+    common_params.data_type = options.data_type->value();
     return common_params;
 }
 } // namespace gemm_tuner
diff --git a/examples/gemm_tuner/CommonGemmExampleOptions.h b/examples/gemm_tuner/CommonGemmExampleOptions.h
index 5f079ab..04a8f22 100644
--- a/examples/gemm_tuner/CommonGemmExampleOptions.h
+++ b/examples/gemm_tuner/CommonGemmExampleOptions.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,9 @@
 #ifndef ARM_COMPUTE_EXAMPLES_GEMM_TUNER_COMMON_GEMM_EXAMPLE_OPTIONS
 #define ARM_COMPUTE_EXAMPLES_GEMM_TUNER_COMMON_GEMM_EXAMPLE_OPTIONS
 
+#include "arm_compute/core/Types.h"
+#include "arm_compute/graph/TypeLoader.h"
+#include "utils/TypePrinter.h"
 #include "utils/command_line/CommandLineOptions.h"
 #include "utils/command_line/CommandLineParser.h"
 
@@ -32,10 +35,11 @@
 /** Structure holding all the common gemm example parameters */
 struct CommonGemmExampleParams
 {
-    size_t M{ 100 }; /**< Number of lhs matrix rows */
-    size_t N{ 100 }; /**< Number of rhs matrix columns */
-    size_t K{ 50 };  /**< Number of lhs matrix columns/rhs matrix rows */
-    size_t B{ 1 };   /**< Batch size */
+    size_t                M{ 100 };                                /**< Number of lhs matrix rows */
+    size_t                N{ 100 };                                /**< Number of rhs matrix columns */
+    size_t                K{ 50 };                                 /**< Number of lhs matrix columns/rhs matrix rows */
+    size_t                B{ 1 };                                  /**< Batch size */
+    arm_compute::DataType data_type{ arm_compute::DataType::F32 }; /**< Data type */
 };
 
 /** Formatted output of the CommonGemmExampleParams type
@@ -75,11 +79,12 @@
     /** Default destructor */
     ~CommonGemmExampleOptions() = default;
 
-    arm_compute::utils::ToggleOption         *help; /**< Show help option */
-    arm_compute::utils::SimpleOption<size_t> *M;    /**< Number of lhs matrix rows option */
-    arm_compute::utils::SimpleOption<size_t> *N;    /**< Number of rhs matrix columns option */
-    arm_compute::utils::SimpleOption<size_t> *K;    /**< Number of lhs matrix columns/rhs matrix rows option */
-    arm_compute::utils::SimpleOption<size_t> *B;    /**< Batch size option */
+    arm_compute::utils::ToggleOption                      *help;      /**< Show help option */
+    arm_compute::utils::SimpleOption<size_t>              *M;         /**< Number of lhs matrix rows option */
+    arm_compute::utils::SimpleOption<size_t>              *N;         /**< Number of rhs matrix columns option */
+    arm_compute::utils::SimpleOption<size_t>              *K;         /**< Number of lhs matrix columns/rhs matrix rows option */
+    arm_compute::utils::SimpleOption<size_t>              *B;         /**< Batch size option */
+    arm_compute::utils::EnumOption<arm_compute::DataType> *data_type; /**< Data type */
 };
 
 /** Consumes the common gemm example options and creates a structure containing all information
diff --git a/examples/gemm_tuner/GemmTuner.py b/examples/gemm_tuner/GemmTuner.py
index 29c414c..3e75051 100644
--- a/examples/gemm_tuner/GemmTuner.py
+++ b/examples/gemm_tuner/GemmTuner.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019 ARM Limited.
+# Copyright (c) 2019-2020 ARM Limited.
 #
 # SPDX-License-Identifier: MIT
 #
@@ -41,18 +41,21 @@
 Strategy = Enum("Strategy", ["Native", "ReshapedOnlyRHS", "Reshaped"])
 
 # Gemm parameter
+
+
 class GEMMParam(NamedTuple):
     M: int  # Number of lhs matrix rows
     N: int  # Number of rhs matrix columns
     K: int  # Number of lhs matrix columns/rhs matrix rows
     B: int  # Batch size
+    data_type: str  # Data type
 
-    @staticmethod
-    def parse_from_strs(*args):
-        return GEMMParam(*map(int, args))
+    @classmethod
+    def parse_from_strs(cls, *M_N_K_B, data_type):
+        return cls(*map(int, M_N_K_B), str(data_type))
 
     def __str__(self):
-        return "-".join(map(str, self))
+        return ",".join(map(str, self))
 
 
 # Gemm configuration for strategy Native
@@ -61,13 +64,13 @@
     n0: int  # Number of columns processed by the matrix multiplication
     k0: int  # Number of partial accumulations performed by the matrix multiplication
 
-    @staticmethod
-    def parse_from_strs(*args):
-        *mnk, = map(int, args)
-        return NativeGEMMConfig(*mnk)
+    @classmethod
+    def parse_from_strs(cls, *args):
+        (*mnk,) = map(int, args)
+        return cls(*mnk)
 
     def __str__(self):
-        return "-".join(map(str, self))
+        return ",".join(map(str, self))
 
 
 # Gemm configuration for strategy Reshaped Only RHS
@@ -75,19 +78,25 @@
     m0: int  # Number of rows processed by the matrix multiplication
     n0: int  # Number of columns processed by the matrix multiplication
     k0: int  # Number of partial accumulations performed by the matrix multiplication
-    h0: int  # Number of horizontal blocks of size (k0xn0) stored on the same output row
-    interleave_rhs: bool  # Interleave rhs matrix (1) / Do not interleave rhs matrix (0)
-    transpose_rhs: bool  # Transpose rhs matrix but not lhs matrix (1) / Do not transpose rhs matrix but do transpose lhs matrix (0)
+    # Number of horizontal blocks of size (k0xn0) stored on the same output row
+    h0: int
+    # Interleave rhs matrix (1) / Do not interleave rhs matrix (0)
+    interleave_rhs: bool
+    # Transpose rhs matrix but not lhs matrix (1) / Do not transpose rhs matrix but do transpose lhs matrix (0)
+    transpose_rhs: bool
+    # Export rhs matrix to cl_image (1) / Do not export rhs matrix to cl_image (0)
+    export_to_cl_image_rhs: bool
 
-    @staticmethod
-    def parse_from_strs(*args):
-        *mnkh, interleave_rhs, transpose_rhs = map(int, args)
+    @classmethod
+    def parse_from_strs(cls, *args):
+        (*mnkh, interleave_rhs, transpose_rhs, export_to_cl_image_rhs,) = map(int, args)
         interleave_rhs = interleave_rhs == 1
         transpose_rhs = transpose_rhs == 1
-        return ReshapedOnlyRHSGEMMConfig(*mnkh, interleave_rhs, transpose_rhs)
+        export_to_cl_image_rhs = export_to_cl_image_rhs == 1
+        return cls(*mnkh, interleave_rhs, transpose_rhs, export_to_cl_image_rhs)
 
     def __str__(self):
-        return "-".join(map(str, self))
+        return ",".join(map(str, self))
 
 
 # Gemm configuration for strategy Reshaped
@@ -95,55 +104,90 @@
     m0: int  # Number of rows processed by the matrix multiplication
     n0: int  # Number of columns processed by the matrix multiplication
     k0: int  # Number of partial accumulations performed by the matrix multiplication
-    v0: int  # Number of vertical blocks of size (m0xk0) stored on the same output row
-    h0: int  # Number of horizontal blocks of size (k0xn0) stored on the same output row
-    interleave_lhs: bool  # Interleave lhs matrix (1) / Do not interleave lhs matrix (0)
-    interleave_rhs: bool  # Interleave rhs matrix (1) / Do not interleave rhs matrix (0)
-    transpose_rhs: bool  # Transpose rhs matrix but not lhs matrix (1) / Do not transpose rhs matrix but do transpose lhs matrix (0)
+    # Number of vertical blocks of size (m0xk0) stored on the same output row
+    v0: int
+    # Number of horizontal blocks of size (k0xn0) stored on the same output row
+    h0: int
+    # Interleave lhs matrix (1) / Do not interleave lhs matrix (0)
+    interleave_lhs: bool
+    # Interleave rhs matrix (1) / Do not interleave rhs matrix (0)
+    interleave_rhs: bool
+    # Transpose rhs matrix but not lhs matrix (1) / Do not transpose rhs matrix but do transpose lhs matrix (0)
+    transpose_rhs: bool
+    # Export rhs matrix to cl_image (1) / Do not export rhs matrix to cl_image (0)
+    export_to_cl_image_rhs: bool
 
-    @staticmethod
-    def parse_from_strs(*args):
-        *mnkvh, interleave_lhs, interleave_rhs, transpose_rhs = map(int, args)
+    @classmethod
+    def parse_from_strs(cls, *args):
+        (*mnkvh, interleave_lhs, interleave_rhs, transpose_rhs, export_to_cl_image_rhs,) = map(int, args)
         interleave_lhs = interleave_lhs == 1
         interleave_rhs = interleave_rhs == 1
         transpose_rhs = transpose_rhs == 1
-        return ReshapedGEMMConfig(*mnkvh, interleave_lhs, interleave_rhs, transpose_rhs)
+        export_to_cl_image_rhs = export_to_cl_image_rhs == 1
+        return cls(*mnkvh, interleave_lhs, interleave_rhs, transpose_rhs, export_to_cl_image_rhs)
 
     def __str__(self):
-        return "-".join(map(str, self))
+        return ",".join(map(str, self))
 
 
 # Measurement we take from the benchmark result.
 class Measurement(NamedTuple):
-    opencl_timer_ms: float
+    opencl_timer_ms_reshape: float
+    opencl_timer_ms_kernel: float
+
+    def get_total_ms(self):
+        return self.opencl_timer_ms_reshape + self.opencl_timer_ms_kernel
 
     def is_close_to(self, other, tol):
-        return math.fabs(self.opencl_timer_ms - other.opencl_timer_ms) < tol
+        return math.fabs(self.get_total_ms() - other.get_total_ms()) < tol
 
     def is_better_than(self, other, tol):
-        return self < other and not self.is_close_to(other)
+        return self.get_total_ms() < other.get_total_ms() and not self.is_close_to(
+            other
+        )
 
     def __add__(self, other):
-        return Measurement(self.opencl_timer_ms + other.opencl_timer_ms)
+        return Measurement(
+            self.opencl_timer_ms_reshape + other.opencl_timer_ms_reshape,
+            self.opencl_timer_ms_kernel + other.opencl_timer_ms_kernel,
+        )
 
     def __sub__(self, other):
-        return Measurement(self.opencl_timer_ms - other.opencl_timer_ms)
+        return Measurement(
+            self.opencl_timer_ms_reshape - other.opencl_timer_ms_reshape,
+            self.opencl_timer_ms_kernel - other.opencl_timer_ms_kernel,
+        )
 
     def __mul__(self, other):
-        return Measurement(self.opencl_timer_ms * other.opencl_timer_ms)
+        return Measurement(
+            self.opencl_timer_ms_reshape * other.opencl_timer_ms_reshape,
+            self.opencl_timer_ms_kernel * other.opencl_timer_ms_kernel,
+        )
 
     def __floordiv__(self, other):
-        return Measurement(self.opencl_timer_ms // other.opencl_timer_ms)
+        return Measurement(
+            self.opencl_timer_ms_reshape // other.opencl_timer_ms_reshape,
+            self.opencl_timer_ms_kernel // other.opencl_timer_ms_kernel,
+        )
 
     def __truediv__(self, other):
-        return Measurement(self.opencl_timer_ms / other.opencl_timer_ms)
+        return Measurement(
+            self.opencl_timer_ms_reshape / other.opencl_timer_ms_reshape,
+            self.opencl_timer_ms_kernel / other.opencl_timer_ms_kernel,
+        )
 
     def __pow__(self, power):
-        return Measurement(self.opencl_timer_ms ** power)
+        return Measurement(
+            self.opencl_timer_ms_reshape ** power, self.opencl_timer_ms_kernel ** power
+        )
+
+    def __str__(self):
+        return ",".join(map(str, self))
 
 
 # GEMMConfig Type
-GEMMConfigT = Union[NativeGEMMConfig, ReshapedOnlyRHSGEMMConfig, ReshapedGEMMConfig]
+GEMMConfigT = Union[NativeGEMMConfig,
+                    ReshapedOnlyRHSGEMMConfig, ReshapedGEMMConfig]
 
 
 # Representation of the benchmark result from a single experiment
@@ -154,24 +198,6 @@
     measurement: Measurement
 
 
-# Representation of a single row of BenchmarkResult in CSV
-# NOTE: In the CSV representation, we merge all fields of Gemm Config into a single field "GEMMConfig", but keep the
-# fields of GEMMParam and Measurement
-# The example entry including header would look like:
-# M   , N , K  , B, Strategy         , GEMMConfig       , OpenCLTimer_MS
-# 1225, 32, 192, 1, Reshaped         , 4-4-4-3-1-1-1-0  , 0.3309
-BenchmarkResultCSVRow = namedtuple(
-    "BenchmarkResultCSVRow", GEMMParam._fields + ("Strategy", "GEMMConfig") + Measurement._fields
-)
-
-
-def benchmark_result_2_csv_row(result: BenchmarkResult) -> BenchmarkResultCSVRow:
-    """ Convert a BenchmarkResult into its CSV row form """
-    return BenchmarkResultCSVRow(
-        *(result.gemm_param + (result.strategy.name, str(result.gemm_config)) + result.measurement)
-    )
-
-
 class GEMMBenchmarkResultRecorder:
     """ A recorder that records and organises GEMM Benchmark results, and produces various reports on the record.
     """
@@ -210,7 +236,9 @@
             best_gc_set = best_gc_sets.setdefault((gemm_param, strategy), [])
             best_gc_set.append((gemm_config, measurement))
             # Sort the best config set (list)
-            best_gc_set = sorted(best_gc_set, key=lambda gc_and_m: gc_and_m[1])
+            best_gc_set = sorted(
+                best_gc_set, key=lambda gc_and_m: gc_and_m[1].get_total_ms()
+            )
             # Filter out configs that are beyond tolerance to the best GEMMConfig's measurement
             best_gc, best_m = best_gc_set[0]
             best_gc_set_new = [
@@ -228,9 +256,14 @@
         """ Get the best GEMMConfig set per GEMMParam per Strategy, and flatten the result into a sequence
         of BenchmarkResults
         """
-        for (gemm_param, strategy), best_gc_sets in self.get_best_gemm_configs().items():
+        for (
+            (gemm_param, strategy),
+            best_gc_sets,
+        ) in self.get_best_gemm_configs().items():
             for best_gemm_config, best_measurement in best_gc_sets:
-                yield BenchmarkResult(gemm_param, strategy, best_gemm_config, best_measurement)
+                yield BenchmarkResult(
+                    gemm_param, strategy, best_gemm_config, best_measurement
+                )
 
     def get_config_distributions(self):
         """ Return GEMMConfigDistribution for each strategy
@@ -244,38 +277,72 @@
 
         return gemm_config_distributions
 
-    def save_to_csvs(self, out_dir, only_best_config=True):
-        """ Save records to an output directory of csv files.
-        The directory is organized such that each strategy gets its own CSV file.
+    def get_best_gemm_strategies(self):
+        """ Get the best Stratey per GEMMParam
+        """
+        all_results: Dict[GEMMParam, List[Tuple[Strategy, Measurement]]] = defaultdict(
+            list
+        )
+
+        best_strategies: Dict[GEMMParam, Strategy] = {}
+
+        for gemm_param, strategy, gemm_config, measurement in self.get_record():
+            all_results[gemm_param].append((strategy, measurement))
+
+        for gemm_param, results_set in all_results.items():
+            # Sort the best results set (list)
+            results_set = sorted(
+                results_set, key=lambda s_and_m: s_and_m[1].get_total_ms()
+            )
+            # Select best Strategy
+            best_s, best_m = results_set[0]
+            best_strategies[gemm_param] = best_s
+
+        return best_strategies
+
+    def save_to_jsons(self, out_dir, only_best_config=True):
+        """ Save records to an output directory of JSON files.
+        The directory is organized such that each strategy gets its own JSON file.
+        The directory also includes a JSON file to define the best strategy per GEMM Param.
         """
         if not os.path.exists(out_dir):
-            logging.info("Output directory {} does not exist. Creating...".format(out_dir))
-            os.mkdir(out_dir)
-        for strategy in self._strategies:
-            out_csv_path = os.path.join(out_dir, strategy.name)
-            if os.path.exists(out_csv_path):
-                overwrite = (
-                    input(
-                        "Output CSV {} already exists. Overwrite? [Y/N]: ".format(out_csv_path)
-                    ).lower()
-                    == "y"
-                )
-                if not overwrite:
-                    logging.info("Skipping {}".format(out_csv_path))
-                    continue
-            logging.info("Saving csv file to {}".format(out_csv_path))
-            record = (
-                self.get_best_gemm_configs_as_sequence() if only_best_config else self.get_record()
+            logging.info(
+                "Output directory {} does not exist. Creating...".format(
+                    out_dir)
             )
-            with open(out_csv_path, "w") as f:
-                csv_writer = csv.DictWriter(f, fieldnames=BenchmarkResultCSVRow._fields)
-                csv_writer.writeheader()
-                csv_writer.writerows(
-                    benchmark_result_2_csv_row(res)._asdict()
-                    for res in record
-                    if res.strategy == strategy
+            os.mkdir(out_dir)
+
+        out_json_path = os.path.join(out_dir, "gemm_type_selection.json")
+        if check_out_path(out_json_path):
+            results = self.get_best_gemm_strategies()
+            results = {str(key): value.name for key, value in results.items()}
+            dump_json(out_json_path, results)
+
+        for strategy in self._strategies:
+            out_json_path = os.path.join(
+                out_dir, ("gemm_config_" + strategy.name.lower() + ".json")
+            )
+            if check_out_path(out_json_path):
+                record = (
+                    self.get_best_gemm_configs_as_sequence()
+                    if only_best_config
+                    else self.get_record()
                 )
-            logging.info("Saved")
+                results = defaultdict(list)
+                for res in record:
+                    if res.strategy == strategy:
+                        results[str(res.gemm_param)].append(
+                            {
+                                "GEMMConfig": str(res.gemm_config),
+                                "OpenCL_Timer_ms_reshape": str(
+                                    res.measurement.opencl_timer_ms_reshape
+                                ),
+                                "OpenCL_Timer_ms_kernel": str(
+                                    res.measurement.opencl_timer_ms_kernel
+                                ),
+                            }
+                        )
+                dump_json(out_json_path, results)
 
     def summary(self, sum_level=SummaryLevel.Short):
         """ Return the summary string of the record
@@ -314,9 +381,9 @@
     def __init__(self):
         """ Initializer
         """
-        self._gemm_config_dist: Dict[GEMMConfig, List[Tuple[GEMMParam, Measurement]]] = defaultdict(
-            list
-        )
+        self._gemm_config_dist: Dict[
+            GEMMConfig, List[Tuple[GEMMParam, Measurement]]
+        ] = defaultdict(list)
         self._gemm_config_freq = Counter()
 
     def add(self, benchmark_result: BenchmarkResult):
@@ -376,14 +443,15 @@
 #           GEMMParam + GEMMConfig
 #   in that order.
 # For example, the example args of running a reshaped rhs only example could be:
-#   100,100,100,1, 4, 4, 4, 1,             1,            1
-#   M  ,N  ,K,  B,m0,n0,k0,h0,interleave_rhs,transpose_rhs
-#   <-GEMMParam-><-------------GEMMConfig-------------->
+#   100,100,100,1, 4, 4, 4, 1,             1,            1,                     0
+#   M  ,N  ,K,  B,m0,n0,k0,h0,interleave_rhs,transpose_rhs,export_to_cl_image_rhs
+#   <-GEMMParam-><-------------GEMMConfig--------------------------------------->
 # Note that the test strategy_name == strategy.name is in place to avoid unwanted enum aliases
 GEMM_EXAMPLE_ARGS_FACTORY = {
+    # We ignore the data type field from GEMMParam as that is extracted separately
     strategy: namedtuple(
         "{}_Gemm_Example_Args".format(strategy_name),
-        GEMMParam._fields + GEMM_CONFIG_FACTORY[strategy]._fields,
+        GEMMParam._fields[:-1] + GEMM_CONFIG_FACTORY[strategy]._fields,
     )
     for strategy_name, strategy in Strategy.__members__.items()
     if strategy_name == strategy.name
@@ -398,8 +466,11 @@
 
 
 def parse_benchmark_commandline(commandline: str) -> Dict[str, str]:
-    """ Parse the benchmark example command-line string into a dictionary of command-line agruments
+    """ Parse the benchmark example command-line string into a dictionary of command-line arguments
     """
+    # Separate the data type option from the example_args portion of the string
+    commandline = commandline.replace(",--type=", " --type=")
+
     args = commandline.split()
     # Discard program name
     args = args[1:]
@@ -439,30 +510,47 @@
         # Get gemm params + gemm configs from example args
         benchmark_args = parse_benchmark_commandline(json_res["CommandLine"])
         Gemm_Example_Args_T = GEMM_EXAMPLE_ARGS_FACTORY[strategy]
-        example_args = Gemm_Example_Args_T(*(benchmark_args["example_args"].split(",")))
+        example_args = Gemm_Example_Args_T(
+            *(benchmark_args["example_args"].split(",")))
         # Gemm_Example_Arg consists of GEMMParam first and then GEMMConfig (in that order)
-        gemm_param_fields_len = len(GEMMParam._fields)
-        gemm_param = GEMMParam.parse_from_strs(*example_args[:gemm_param_fields_len])
+        # However data type option is parsed separately from end of options, hence -1 is applied to fields length
+        gemm_param_fields_len = len(GEMMParam._fields) - 1
+        gemm_param = GEMMParam.parse_from_strs(
+            *example_args[:gemm_param_fields_len],
+            data_type = benchmark_args["type"])
         GEMMConfig = GEMM_CONFIG_FACTORY[strategy]
-        gemm_config = GEMMConfig.parse_from_strs(*example_args[gemm_param_fields_len:])
+        gemm_config = GEMMConfig.parse_from_strs(
+            *example_args[gemm_param_fields_len:])
 
         # Get OpenCL_Time_Ms stats
         measurements = list(example_test_data["measurements"].items())
-        # There should only be 1 instrument per run
-        assert len(measurements) == 1
-        measurement_instrument, data = measurements.pop()
-        # Get instrument name and assert that it is the one we expect
-        measurement_instrument_name = measurement_instrument.split("/")[0]
-        assert measurement_instrument_name == "OpenCLTimer"
-        # Take either the minimum or the average of the raw data as the measurement value
-        if measurement_method == "min":
-            measurement_val = min(data["raw"])
-        elif measurement_method == "avg":
-            measurement_val = sum(data["raw"]) / len(data["raw"])
-        else:
-            raise ValueError("Invalid measurement method: {}".format(measurement_method))
+        # For reshaped RHS only we have two measurements (one also for the reshape kernel)
+        # Hence we must parse and sum them
+        measurement_ms_reshape = 0
+        measurement_ms_kernel = 0
+        for single_measurement in measurements:
+            measurement_instrument, data = single_measurement
+            # Get instrument name and assert that it is the one we expect
+            measurement_instrument_name = measurement_instrument.split("/")[0]
+            assert measurement_instrument_name == "OpenCLTimer"
+            # Take either the minimum or the average of the raw data as the measurement value
+            if measurement_method == "min":
+                measurement_val = min(data["raw"])
+            elif measurement_method == "avg":
+                measurement_val = sum(data["raw"]) / len(data["raw"])
+            else:
+                raise ValueError(
+                    "Invalid measurement method: {}".format(measurement_method)
+                )
 
-        measurement = Measurement(measurement_val)
+            measurement_type = measurement_instrument.split("/")[1]
+            if "reshape" in measurement_type.split("_"):
+                measurement_ms_reshape = measurement_val
+            else:
+                measurement_ms_kernel = measurement_val
+
+        measurement = Measurement(
+            measurement_ms_reshape, measurement_ms_kernel)
 
         yield BenchmarkResult(gemm_param, strategy, gemm_config, measurement)
 
@@ -475,15 +563,42 @@
             yield json.load(res_fp)
 
 
+def check_out_path(out_path):
+    if os.path.exists(out_path):
+        overwrite = (
+            input(
+                "Output JSON {} already exists. Overwrite? [Y/N]: ".format(
+                    out_path)
+            ).lower()
+            == "y"
+        )
+        if not overwrite:
+            logging.info("Skipping {}".format(out_path))
+            return False
+    logging.info("Saving JSON file to {}".format(out_path))
+    return True
+
+
+def dump_json(out_path, dict):
+    with open(out_path, "w") as f:
+        json.dump(dict, f)
+    logging.info("Saved")
+
+
 ################################################################################
 # Main
 ################################################################################
 
 
 def main(args):
-    logging.info("Searching best gemm configurations from {}".format(args.benchmark_results_dir))
+    logging.info(
+        "Searching best gemm configurations from {}".format(
+            args.benchmark_results_dir)
+    )
 
-    benchmark_results = extract_benchmark_results(parse_json(args.benchmark_results_dir))
+    benchmark_results = extract_benchmark_results(
+        parse_json(args.benchmark_results_dir)
+    )
 
     # Add all benchmark results to the recorder
     benchmark_result_recorder = GEMMBenchmarkResultRecorder(tol=args.tolerance)
@@ -496,7 +611,8 @@
         recorder_sum_level = GEMMBenchmarkResultRecorder.SummaryLevel.Short
 
     # Print overall summary of the recorded results
-    logging.info(benchmark_result_recorder.summary(sum_level=recorder_sum_level))
+    logging.info(benchmark_result_recorder.summary(
+        sum_level=recorder_sum_level))
 
     # Get GEMM configuration distributions for each strategy
     all_config_dists = benchmark_result_recorder.get_config_distributions()
@@ -508,12 +624,16 @@
         for config, freq in config_dist.frequency():
             logging.debug("{}, {}".format(config, freq))
         logging.info(
-            "Best GEMM Config: {} with std: {}".format(config_dist.best_config(), config_dist.std())
+            "Best GEMM Config: {} with std: {}".format(
+                config_dist.best_config(), config_dist.std()
+            )
         )
 
-    # Save the recorded results to csv files in output directory
+    # Save the recorded results to JSON files in output directory
     if args.output_dir is not None:
-        benchmark_result_recorder.save_to_csvs(args.output_dir, only_best_config=(not args.debug))
+        benchmark_result_recorder.save_to_jsons(
+            args.output_dir, only_best_config=(not args.debug)
+        )
 
 
 if __name__ == "__main__":
@@ -538,7 +658,7 @@
         metavar="PATH",
         action="store",
         type=str,
-        help="Path to directory that holds output csv files. One per strategy",
+        help="Path to directory that holds output JSON files. One for strategy selection and one per strategy for GEMM config selection",
     )
     parser.add_argument(
         "-t",
@@ -550,7 +670,11 @@
         milliseconds. Recommended value: <= 0.1 ms",
     )
     parser.add_argument(
-        "-D", "--debug", dest="debug", action="store_true", help="Enable script debugging output"
+        "-D",
+        "--debug",
+        dest="debug",
+        action="store_true",
+        help="Enable script debugging output",
     )
     args = parser.parse_args()
     logging_level = logging.DEBUG if args.debug else logging.INFO
diff --git a/examples/gemm_tuner/README.md b/examples/gemm_tuner/README.md
index a4cde10..1effd2f 100644
--- a/examples/gemm_tuner/README.md
+++ b/examples/gemm_tuner/README.md
@@ -2,19 +2,77 @@
 
 ## Introduction
 
-This is a set of 2 script tools for tuning the performance of OpenCL GEMM kernels (limited to Convolution layer
-functions only for now).  Specifically, we tune 3 GEMM kernels, each has a different implementation **strategy** of the
-GEMM operation: **native**, **reshaped**, **reshaped only rhs**. The details of these strategies can be found in the
-documentations of the corresponding kernels: **CLGEMMMatrixMultiplyNativeKernel**,
-**CLGEMMMatrixMultiplyReshapedKernel** and **CLGEMMMatrixMultiplyReshapedOnlyRHSKernel**.
+This is a set of tools for tuning the performance of OpenCL GEMM kernels.  Specifically, we tune 3 GEMM kernels, each
+has a different implementation **strategy** of the GEMM operation: **native**, **reshaped**, **reshaped only rhs**.
+The details of these strategies can be found in the documentations of the corresponding kernels:
+**CLGEMMMatrixMultiplyNativeKernel**, **CLGEMMMatrixMultiplyReshapedKernel** and
+**CLGEMMMatrixMultiplyReshapedOnlyRHSKernel**.
 
-The outputs of the tuning process are 1 optimal configuration (called **GEMM Configuration** or **GEMMConfig**, for
-more details see Approach section) for each of the 3 strategies.
+The Tuner consists of 2 scripts and 3 binaries:
+* benchmark_gemm_examples.sh and GemmTuner.py under examples/gemm_tuner, and
+* benchmark_cl_gemm_native, benchmark_cl_gemm_reshaped_rhs_only and benchmark_cl_gemm_reshaped under
+  build/tests/gemm_tuner (you'll need to build the library first)
 
-## Location
-The 2 scripts **benchmark_gemm_examples.sh** and **GemmTuner.py** can be found under $ACL_ROOT/examples/gemm_tuner.
+The inputs to the Tuner are a list of 4-valued tuples we call **GEMM shape** or **GEMMParam** (M, N, K, B, and possibly
+data type). They define the "shape" and other parameters (e.g. data type) of a GEMM operation:
+```
+LHS x RHS = DST
+```
+Where LHS is of shape MxK, RHS is of shape KxN and DST is of shape MxN, and B is the batch size.
 
-## Pre-requisite
+The outputs of the tuning process are 4 json files:
+1. gemm_type_selection.json: selects which kernel type is the best for each GEMMParam
+2. gemm_config_native.json: selects a list of best **GEMMConfigs** of the native kernel for each GEMMParam
+3. gemm_config_reshapedonlyrhs.json: selects a list of best GEMMConfigs of the reshaped_only_rhs kernel for each GEMMParam
+4. gemm_config_reshaped.json: selects a list of best GEMMConfigs of the reshaped kernel for each GEMMParam
+
+These 4 files are the current representations we use for what we call the **heuristics** of a GEMM op: given a GEMMParam,
+what kernel and subsequently what configurations for that kernel are the most performant.
+
+## Step-by-step example
+
+### Step1: Prepare the shape and configs files
+1. We first need to identify the shapes that we are interested in and store them in a csv file, say *gemm_shapes.csv*.
+2. Then we need to specify a set of good GEMMConfig candidates for each kernel in 3 separate csv files (this requires
+    some prior heuristics, but can be provided by the ACL developers upon requests, based on your target device).
+
+   Say we have *gemm_configs_native.csv*, *gemm_configs_reshaped.csv* and *gemm_configs_reshaped_only_rhs.csv*.
+
+   Please refer to the Prerequisite section for more details
+
+### Step2: Push relevant files to the target device
+All the files that need to be present on the target device are:
+* benchmark script: \<ACL\>/examples/gemm_tuner/benchmark_gemm_examples.sh
+* shapes and configs csv files: gemm_shapes.csv, gemm_configs_native.csv, gemm_configs_reshaped_only_rhs.csv, gemm_configs_reshaped.csv
+* Example benchmark binaries: \<ACL\>/build/tests/gemm_tuner/benchmark_cl_gemm*
+
+### Step3: Collect benchmark data
+With these files on device, we can collect benchmark data using the script. Assume all the example binaries are pushed
+to a folder called *gemm_tuner*. While logged onto our device:
+```
+# Native
+./benchmark_gemm_examples.sh -s native -e ./gemm_tuner -g ./gemm_shapes.csv -c ./gemm_configs_native.csv -o results/native
+# Reshaped Only RHS
+./benchmark_gemm_examples.sh -s reshaped_rhs_only -e ./gemm_tuner -g ./gemm_shapes.csv -c ./gemm_configs_reshaped_only_rhs.csv -o results/reshaped_only_rhs
+# Reshaped
+./benchmark_gemm_examples.sh -s reshaped -e ./gemm_tuner -g ./gemm_shapes.csv -c ./gemm_configs_reshaped.csv -o results/reshaped
+```
+You can repeat the 3 commands above to have a bit of redundancy in your benchmark data (as you can imagine, measurement is noisy),
+but you may need to change the output folder for each repeat.
+
+### Step4: Generate the heuristics
+1. After benchmarking, we pull the benchmark data, the *results* folder, from the target device to our host machine
+2. We use the GemmTuner.py script to give us the heuristics
+   ```
+   python3 <ACL>/examples/gemm_tuner/GemmTuner.py -b ./results -o heuristics
+   ```
+   When it's finished, there should be 4 json files in the *heuristics* folder
+
+One thing to notice is that the config heuristics might give more than one recommendation for each GEMMParam, because
+we accept all good GEMMConfigs with a tolerance. If you want fewer recommendations, you can decrease the tolerance by
+passing a lower value via *-t \<tolerance\>* to the GemmTuner.py script.
+
+## Prerequisite
 * A target device to be tuned, plus the following on the device:
     * Android or Linux OS
     * Bash shell
@@ -28,10 +86,7 @@
 
        The format is described as:
 
-       A headerless csv file with fields separated by commas and commas only (there cannot be whitespaces around each
-       field).
-
-       Note also comments and extraneous empty lines are not permitted.
+       A headerless csv file with fields separated by commas.
 
        A gemm shape is a list of 4 positive integers \<M, N, K, B\> describing the shapes of the two matrices (LHS and
        RHS) with:
@@ -54,10 +109,10 @@
 
       The format of the file for each strategy is the same:  
 
-      A headerless csv file with fields separated by commas and commas only (there cannot be whitespaces around each
-      field). Note also comments and extraneous empty lines are not permitted.
+      A headerless csv file with fields separated by commas.
 
       However the fields of GEMMConfig differ for each strategy:
+
       * Strategy **native**:
         A gemm config is a list of 3 positive integers \<m0, n0, k0\>, with:
 
@@ -78,9 +133,7 @@
   ...
   ```
       * Strategy **reshaped_rhs_only**:
-
-        A gemm config is a list of 4 positive integers \<m0, n0, k0, h0\> and 2 boolean values interleave_rhs and
-        transpose_rhs, with:
+        A gemm config is a list of 4 positive integers <m0, n0, k0, h0> and 3 boolean values:
 
         m0 - Number of rows processed by the matrix multiplication  
         n0 - Number of columns processed by the matrix multiplication  
@@ -88,6 +141,9 @@
         h0 - Number of horizontal blocks of size (k0xn0) stored on the same output row  
         interleave_rhs - Interleave rhs matrix (1) / Do not interleave rhs matrix (0)  
         transpose_rhs - Transpose rhs matrix (1) / Do not transpose rhs matrix (0)  
+        export_to_cl_image_rhs - Export rhs matrix to cl_image (1) / Do not export rhs matrix to cl_image (0). Can only be true
+                                with certain combinations of the GEMMParams and other configs. Please refer to CLGEMMReshapeRHSMatrixKernel
+                                for more details
 
         Only the following configurations of M0, N0 and K0 are currently supported:
 
@@ -98,14 +154,12 @@
 
         An example gemm config file looks like:
   ```
-  4,4,4,1,1,1
-  4,4,4,3,1,0
+  4,4,4,1,1,1,0
+  4,4,4,3,1,0,1
   ...
   ```
       * Strategy **reshaped**:
-
-        A gemm config is a list of 5 positive integers \<m0, n0, k0, v0, h0\> and 3 boolean values interleave_lhs,
-        interleave_rhs and transpose_rhs, with:
+        A gemm config is a list of 5 positive integers <m0, n0, k0, v0, h0> and 4 boolean values:
 
         m0 - Number of rows processed by the matrix multiplication  
         n0 - Number of columns processed by the matrix multiplication  
@@ -114,29 +168,31 @@
         h0 - Number of horizontal blocks of size (k0xn0) stored on the same output row  
         interleave_lhs - Interleave lhs matrix (1) / Do not interleave lhs matrix (0)  
         interleave_rhs - Interleave rhs matrix (1) / Do not interleave rhs matrix (0)  
-        transpose_rhs - Transpose rhs matrix but not lhs matrix (1) / Do not transpose rhs matrix but do transpose
-        lhs matrix (0)  
+        transpose_rhs - Transpose rhs matrix but not lhs matrix (1) / Do not transpose rhs matrix but do transpose lhs matrix (0)  
+        export_to_cl_image_rhs - Export rhs matrix to cl_image (1) / Do not export rhs matrix to cl_image (0). Can only be true
+                                with certain combinations of the GEMMParams and other configs. Please refer to CLGEMMReshapeRHSMatrixKernel
+                                for more details
 
-        * If rhs matrix is transposed only the following configurations are currently supported:
+        If rhs matrix is transposed only the following configurations are currently supported:
 
-          M0 = 2, 3, 4, 5, 6, 7, 8  
-          N0 = 2, 3, 4, 8, 16  
-          K0 = 2, 3, 4, 8, 16  
-          V0 >= 1  
-          H0 >= 1  
+        M0 = 2, 3, 4, 5, 6, 7, 8  
+        N0 = 2, 3, 4, 8, 16  
+        K0 = 2, 3, 4, 8, 16  
+        V0 >= 1  
+        H0 >= 1  
 
-        * If lhs matrix is transposed only the following configurations are currently supported:
+        If lhs matrix is transposed only the following configurations are currently supported:
 
-          M0 = 2, 3, 4, 8  
-          N0 = 2, 3, 4, 8, 16  
-          K0 = 2, 3, 4, 8, 16  
-          V0 >= 1  
-          H0 >= 1  
+        M0 = 2, 3, 4, 8  
+        N0 = 2, 3, 4, 8, 16  
+        K0 = 2, 3, 4, 8, 16  
+        V0 >= 1  
+        H0 >= 1  
 
         An example gemm config file looks like:
   ```
-  4,4,4,1,3,1,1,1
-  4,4,4,3,3,1,1,0
+  4,4,4,1,3,1,1,1,0
+  4,4,4,3,3,1,1,0,1
   ...
   ```
 * A host machine, plus these on the machine:
@@ -144,45 +200,53 @@
     * GemmTuner.py script
 
 ## Usage
-The tuning stage consists of 2 steps:
+The usage of the 2 scripts:
 
-1. Run benchmarks:
+1. benchmark_gemm_examples.sh
 
    Run the shell script (**benchmark_gemm_examples.sh**) on your **target device**. Note that all the built benchmark
-   examples have to be present on your target device prior to running. The benchmark results will be saved to json
-   files in an output directory.
+   examples: build/tests/gemm_tuner/benchmark_cl_gemm*, have to be present on your target device prior to running.
+   The benchmark results will be saved to json files in an output directory.
    ```
    Usage: benchmark_gemm_examples.sh [-h] -s \<strategy\> -e \<example_binary_dir\> -g \<gemm_shape_file\>
-   -c \<gemm_config_file\> [-o \<out_dir\>]
+   -c \<gemm_config_file\> [-d \<data_type\>] [-o \<out_dir\>]
 
    Options:
            -h
-           Print help messages. If a strategy is specified with -s \<strategy\>, then only display messages relevant
-           to that strategy. Otherwise if no strategy is specified, display messages for all available strategies.
+           Print help messages. If a strategy is specified with -s <strategy>, then only display messages relevant to that
+           strategy. Otherwise if no strategy is specified, display messages for all available strategies.
 
-           -s \<strategy\>
+           -s <strategy>
            Strategy option.
-           Options: native reshaped_rhs_only reshaped.
+           Options: ${ALL_STRATEGY_OPTIONS[@]}.
 
-           -e \<example_binary_dir\>
+           -e <example_binary_dir>
            Path to directory that holds all example binaries
 
-           -g \<gemm_shape_file\>
+           -g <gemm_shape_file>
            Path to gemm shape csv file
 
-           -c \<gemm_config_file\>
+           -c <gemm_config_file>
            Path to gemm config csv file
 
-           -o \<out_dir\>
+           -d <data_type>
+           Data type option with which to run benchmark examples
+           Default: ${DEFAULT_DATA_TYPE}
+           Supported options:
+           Strategy            :    Data Types
+           Native              :    F32
+           Reshaped            :    F16, F32
+           Reshaped RHS Only   :    F16, F32
+
+           -o <out_dir>
            Path to output directory that holds output json files
-           Default: out
+           Default: ${DEFAULT_OUT_DIR}
    ```
-2. Run analyser:
+2. GemmTuner.py:
 
   Run the python script (**GemmTuner.py**) on your **host machine**.
   You'll need to transfer all the benchmark result json files generated from the previous step to your host machine
-  beforehand. The script will output the best configuration, along with some analysis statistics for each strategy, and
-  optionally save the parsed benchmark results into csv files (one for each strategy) for further analysis.
+  beforehand. The script will output the best kernel and gemm configurations for each gemm param in the 4 output json files
    ```
    Usage: GemmTuner.py [-h] -b PATH [-o PATH] [-t TOLERANCE] [-D]
 
@@ -194,40 +258,11 @@
                            result json files have a file extension of
                            'gemmtuner_benchmark'
      -o PATH, --output_dir PATH
-                           Path to directory that holds output csv files. One per
-                           strategy
+                           Path to directory that holds output json files.
      -t TOLERANCE, --tolerance TOLERANCE
                            For testing if two GEMMConfigs are equivalent in terms
                            of performance. The tolerance is OpenCL timer in
                            milliseconds. Recommended value: <= 0.1 ms
      -D, --debug           Enable script debugging output
 
-   ```
-
-## Approach
-
-This section gives a brief description and rationale of the approach adopted by the current version of GEMM Tuner.
-
-As explained in the Introduction section, the outputs of the tuner are 1 optimal GEMMConfig for each strategy.
-This is because we can only integrate 1 GEMMConfig for each strategy in ACL at compile time. In theory, however, the
-optimal GEMMConfig also depends on different parameters of GEMM (called GEMM Parameter or GEMMParam, e.g.: the shape
-of the operation); thus ideally, for each strategy, the optimal configurations should be a mapping from GEMMParam to
-GEMMConfig instead of a single GEMMConfig.
-
-To address this issue, we ensure the one single optimal GEMMConfig can generalise well to all potential GEMMParams
-(or at least the ones that we care about). The approach we adopt involves a preliminary stage where a collection of
-common GEMMParams (GEMM shapes from popular networks) are compiled. Then, to reduce the final tuning time, rather
-contradictorily, we spend a lot of time searching for near-optimal GEMMConfigs for each GEMMParam first, and then
-discard redundant GEMMParams which share similar optimal GEMMConfigs with others. The resultant list of GEMMParams is
-called a __GEMMParam search list__, as in these GEMMParams are typical enough to capture the space of GEMMParams that
-we care about.
-
-During this preliminary stage we also produce a list of good GEMMConfigs that can be used to search for the optimal one
-in the actual tuning stage. This, again, is to reduce the tuning time, and the resultant list is called a
-__GEMMConfig search list__.
-
-The GEMMParam search list and the GEMMConfig search list are investigated and prepared by the developers; the users of
-GEMM tuner need not worry about producing them, but they need to obtain them prior to running the tuner.
-
-Once these two lists (2 for each strategy, so 6 in total) are obtained, they can be fed to the tuner, to produce the
-optimal GEMMConfig(s).
\ No newline at end of file
+   ```
\ No newline at end of file
diff --git a/examples/gemm_tuner/benchmark_gemm_examples.sh b/examples/gemm_tuner/benchmark_gemm_examples.sh
old mode 100755
new mode 100644
index d6f41cc..b5628f7
--- a/examples/gemm_tuner/benchmark_gemm_examples.sh
+++ b/examples/gemm_tuner/benchmark_gemm_examples.sh
@@ -1,4 +1,4 @@
-# Copyright (c) 2019 ARM Limited.
+# Copyright (c) 2019 Arm Limited.
 #
 # SPDX-License-Identifier: MIT
 #
@@ -36,6 +36,9 @@
 EXAMPLE_BIN_RESHAPED_RHS_ONLY="benchmark_cl_gemm_reshaped_rhs_only"
 EXAMPLE_BIN_RESHAPED="benchmark_cl_gemm_reshaped"
 
+# Default data type
+DEFAULT_DATA_TYPE="F32"
+
 # Default output directory
 DEFAULT_OUT_DIR="out"
 
@@ -56,10 +59,7 @@
 function help_gemm_shape_file() {
   cat >&2 << EOF
 Gemm shape file:
-  Gemm shape file is a headerless csv file with fields separated by commas and commas only (there cannot be whitespaces
-  around each field).
-
-  Note also comments and extraneous empty lines are not permitted.
+  Gemm shape file is a headerless csv file with fields separated by commas
 
   A gemm shape is a list of 4 positive integers <M, N, K, B> describing the shapes of the two matrices (LHS and RHS)
   with:
@@ -88,10 +88,7 @@
 function help_gemm_config_file_native() {
   cat >&2 << EOF
 Gemm config file (Strategy native):
-  Gemm config file is a headerless csv file with fields separated by commas and commas only (there cannot be whitespaces
-  around each field).
-
-  Note also comments and extraneous empty lines are not permitted.
+  Gemm config file is a headerless csv file with fields separated by commas
 
   A gemm config is a list of 3 positive integers <m0, n0, k0>, with:
   m0 - Number of rows processed by the matrix multiplication
@@ -123,18 +120,20 @@
 function help_gemm_config_file_reshaped_rhs_only() {
   cat >&2 << EOF
 Gemm config file (Strategy reshaped_rhs_only):
-  Gemm config file is a headerless csv file with fields separated by commas and commas only (there cannot be whitespaces
-  around each field).
+  Gemm config file is a headerless csv file with fields separated by commas.
 
   Note also comments and extraneous empty lines are not permitted.
 
-  A gemm config is a list of 4 positive integers <m0, n0, k0, h0> and 2 boolean values interleave_rhs and transpose_rhs, with:
+  A gemm config is a list of 4 positive integers <m0, n0, k0, h0> and 3 boolean values:
   m0 - Number of rows processed by the matrix multiplication
   n0 - Number of columns processed by the matrix multiplication
   k0 - Number of partial accumulations performed by the matrix multiplication
   h0 - Number of horizontal blocks of size (k0xn0) stored on the same output row
   interleave_rhs - Interleave rhs matrix (1) / Do not interleave rhs matrix (0)
   transpose_rhs - Transpose rhs matrix (1) / Do not transpose rhs matrix (0)
+  export_to_cl_image_rhs - Export rhs matrix to cl_image (1) / Do not export rhs matrix to cl_image (0). Can only be true
+                           with certain combinations of the GEMMParams and other configs. Please refer to CLGEMMReshapeRHSMatrixKernel
+                           for more details
 
   Only the following configurations of M0, N0 and K0 are currently supported:
   M0 = 1, 2, 3, 4, 5, 6, 7, 8
@@ -143,8 +142,8 @@
   H0 >= 1
 
   An example gemm config file looks like:
-  4,4,4,1,1,1
-  4,4,4,3,1,0
+  4,4,4,1,1,1,0
+  4,4,4,3,1,0,1
   ...
 
 EOF
@@ -162,12 +161,9 @@
 function help_gemm_config_file_reshaped() {
   cat >&2 << EOF
 Gemm config file (Strategy reshaped):
-  Gemm config file is a headerless csv file with fields separated by commas and commas only (there cannot be whitespaces
-  around each field).
+  Gemm config file is a headerless csv file with fields separated by commas
 
-  Note also comments and extraneous empty lines are not permitted.
-
-  A gemm config is a list of 5 positive integers <m0, n0, k0, v0, h0> and 3 boolean values interleave_lhs, interleave_rhs and transpose_rhs, with:
+  A gemm config is a list of 5 positive integers <m0, n0, k0, v0, h0> and 4 boolean values:
   m0 - Number of rows processed by the matrix multiplication
   n0 - Number of columns processed by the matrix multiplication
   k0 - Number of partial accumulations performed by the matrix multiplication
@@ -176,6 +172,9 @@
   interleave_lhs - Interleave lhs matrix (1) / Do not interleave lhs matrix (0)
   interleave_rhs - Interleave rhs matrix (1) / Do not interleave rhs matrix (0)
   transpose_rhs - Transpose rhs matrix but not lhs matrix (1) / Do not transpose rhs matrix but do transpose lhs matrix (0)
+  export_to_cl_image_rhs - Export rhs matrix to cl_image (1) / Do not export rhs matrix to cl_image (0). Can only be true
+                           with certain combinations of the GEMMParams and other configs. Please refer to CLGEMMReshapeRHSMatrixKernel
+                           for more details
 
   If rhs matrix is transposed only the following configurations are currently supported:
   M0 = 2, 3, 4, 5, 6, 7, 8
@@ -192,8 +191,8 @@
   H0 >= 1
 
   An example gemm config file looks like:
-  4,4,4,1,3,1,1,1
-  4,4,4,3,3,1,1,0
+  4,4,4,1,3,1,1,1,0
+  4,4,4,3,3,1,1,0,1
   ...
 
 EOF
@@ -213,7 +212,7 @@
 Run gemm examples of a selected strategy, over provided tunable configurationsa and gemm shapes.
 Save the benchmark results to json files in an output directory.
 
-Usage: ${CMD} [-h] -s <strategy> -e <example_binary_dir> -g <gemm_shape_file> -c <gemm_config_file> [-o <out_dir>]
+Usage: ${CMD} [-h] -s <strategy> -e <example_binary_dir> -g <gemm_shape_file> -c <gemm_config_file> [-d <data_type>] [-o <out_dir>]
 
 Options:
         -h
@@ -233,6 +232,15 @@
         -c <gemm_config_file>
         Path to gemm config csv file
 
+        -d <data_type>
+        Data type option with which to run benchmark examples
+        Default: ${DEFAULT_DATA_TYPE}
+        Supported options:
+        Strategy            :    Data Types
+        Native              :    F32
+        Reshaped            :    F16, F32
+        Reshaped RHS Only   :    F16, F32
+
         -o <out_dir>
         Path to output directory that holds output json files
         Default: ${DEFAULT_OUT_DIR}
@@ -333,8 +341,11 @@
   local total_num_experiment
   local num_params
   local num_configs
-  num_params=$( wc -l ${GEMM_SHAPES_FILE} | cut -d " " -f 1)
-  num_configs=$( wc -l ${GEMM_CONFIGS_FILE} | cut -d " " -f 1 )
+  local match_expression_shape="^([^,]*,){3}[^,]*$"
+  local match_expression_config="^(\s*[0-9]+\s*,)+\s*[0-9]\s*$"
+  # Don't count empty lines and lines starting with # (comments)
+  num_params=$( grep -E "$match_expression_shape" "${GEMM_SHAPES_FILE}" | wc -l  | cut -d " " -f 1)
+  num_configs=$( grep -E "$match_expression_config" "${GEMM_CONFIGS_FILE}" | wc -l  | cut -d " " -f 1)
   (( total_num_experiment=${num_params} * ${num_configs} ))
   # Time elapsed since the beginning in seconds
   local time_elapsed_s
@@ -346,19 +357,22 @@
   do
     while read gemm_config
     do
-      echo "Running..." 1>&2
-      example_args="${gemm_shape},${gemm_config}"
-      # Run experiment
-      ${EXAMPLE_BIN_DIR}/${example_bin} --example_args=${example_args} --iterations=${NUM_ITERATION} --json-file=${OUT_DIR}/${expr_count}.${OUT_EXTENSION} --instruments=OPENCL_TIMER_MS
-      # Print progress
-      print_progress ${expr_count} ${total_num_experiment}
-      # Print time statistics
-      time_elapsed_s=$SECONDS
-      echo "Time elapsed since beginning: $(( $time_elapsed_s / 60 ))m $(( $time_elapsed_s % 60 ))s" 1>&2
-      (( time_est_s=(${total_num_experiment} - ${expr_count}) * ${time_elapsed_s} / ${expr_count} ))
-      echo "Time estimated to finish: $(( $time_est_s / 60 ))m $(( $time_est_s % 60 ))s" 1>&2
-      (( expr_count++ ))
-      echo "Done." 1>&2
+      # Ignore empty lines and lines starting with # (comments)
+      if echo "$gemm_shape" | grep -Eq "$match_expression_shape" && echo "$gemm_config" | grep -Eq "$match_expression_config";then
+        echo "Running..." 1>&2
+        example_args="${gemm_shape},${gemm_config},--type=${DATA_TYPE}"
+        # Run experiment
+        ${EXAMPLE_BIN_DIR}/${example_bin} --example_args=${example_args} --iterations=${NUM_ITERATION} --json-file=${OUT_DIR}/${expr_count}.${OUT_EXTENSION} --instruments=OPENCL_TIMER_MS
+        # Print progress
+        print_progress ${expr_count} ${total_num_experiment}
+        # Print time statistics
+        time_elapsed_s=$SECONDS
+        echo "Time elapsed since beginning: $(( $time_elapsed_s / 60 ))m $(( $time_elapsed_s % 60 ))s" 1>&2
+        (( time_est_s=(${total_num_experiment} - ${expr_count}) * ${time_elapsed_s} / ${expr_count} ))
+        echo "Time estimated to finish: $(( $time_est_s / 60 ))m $(( $time_est_s % 60 ))s" 1>&2
+        (( expr_count++ ))
+        echo "Done." 1>&2
+      fi
     done < "${GEMM_CONFIGS_FILE}"
   done < "${GEMM_SHAPES_FILE}"
   echo "Finished running all configs for ${example_bin}" 1>&2
@@ -405,6 +419,8 @@
 # Path to gemm configs file
 GEMM_CONFIGS_FILE=""
 STRATEGY_OPTION=""
+# Data type to use
+DATA_TYPE=${DEFAULT_DATA_TYPE}
 # Path to output directory
 OUT_DIR=${DEFAULT_OUT_DIR}
 # Output benchmark result file extension
@@ -413,13 +429,14 @@
 HELP=false
 
 # Obtain options
-while getopts "hs:e:g:c:o:" opt; do
+while getopts "hs:e:g:c:d:o:" opt; do
   case "$opt" in
     h) HELP=true ;;
     s) STRATEGY_OPTION=$(to_lower "${OPTARG}");;
     e) EXAMPLE_BIN_DIR="${OPTARG}";;
     g) GEMM_SHAPES_FILE="${OPTARG}";;
     c) GEMM_CONFIGS_FILE="${OPTARG}";;
+    d) DATA_TYPE="${OPTARG}";;
     o) OUT_DIR="${OPTARG}";;
   esac
 done
@@ -459,7 +476,9 @@
   error_msg "Output directory ${OUT_DIR} already exists!"
 
 # Make output directory
-mkdir ${OUT_DIR}
+echo "Making output directory ${OUT_DIR}" 1>&2
+mkdir -p ${OUT_DIR} || error_msg "Failed to make output directory ${OUT_DIR}"
+date +%s > ${OUT_DIR}/start_time_unix_seconds
 
 # Run selected strategy with all configurations
 # Restart the built-in timer
@@ -467,4 +486,6 @@
 [ "${STRATEGY_OPTION}" == "native" ] && run $EXAMPLE_BIN_NATIVE
 [ "${STRATEGY_OPTION}" == "reshaped_rhs_only" ] && run $EXAMPLE_BIN_RESHAPED_RHS_ONLY
 [ "${STRATEGY_OPTION}" == "reshaped" ] && run $EXAMPLE_BIN_RESHAPED
+
+date +%s > ${OUT_DIR}/end_time_unix_seconds
 # Main: Main script }}}
diff --git a/examples/gemm_tuner/cl_gemm_native.cpp b/examples/gemm_tuner/cl_gemm_native.cpp
index 0cacd82..4303508 100644
--- a/examples/gemm_tuner/cl_gemm_native.cpp
+++ b/examples/gemm_tuner/cl_gemm_native.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -132,10 +132,9 @@
     bool do_setup(int argc, char **argv) override
     {
         // Default parameters
-        const DataType            data_type = DataType::F32;
-        const float               alpha     = 1.0f;
-        const float               beta      = 0.0f;
-        const ActivationLayerInfo act_info  = ActivationLayerInfo();
+        const float               alpha    = 1.0f;
+        const float               beta     = 0.0f;
+        const ActivationLayerInfo act_info = ActivationLayerInfo();
         CommonGemmExampleParams   params;
         GemmConfigs               configs;
 
@@ -167,16 +166,16 @@
         }
 
         // Print gemm parameters and configurations
-        std::cerr << "Gemm parameters:" << std::endl;
-        std::cerr << params << std::endl;
-        std::cerr << "Gemm configurations:" << std::endl;
-        std::cerr << configs << std::endl;
+        std::cout << "Gemm parameters:" << std::endl;
+        std::cout << params << std::endl;
+        std::cout << "Gemm configurations:" << std::endl;
+        std::cout << configs << std::endl;
 
         CLScheduler::get().default_init(&tuner);
 
-        lhs.allocator()->init(TensorInfo(TensorShape(params.K, params.M, params.B), 1, data_type));
-        rhs.allocator()->init(TensorInfo(TensorShape(params.N, params.K, params.B), 1, data_type));
-        bias.allocator()->init(TensorInfo(TensorShape(params.N, 1, params.B), 1, data_type));
+        lhs.allocator()->init(TensorInfo(TensorShape(params.K, params.M, params.B), 1, params.data_type));
+        rhs.allocator()->init(TensorInfo(TensorShape(params.N, params.K, params.B), 1, params.data_type));
+        bias.allocator()->init(TensorInfo(TensorShape(params.N, 1, params.B), 1, params.data_type));
 
         GEMMLHSMatrixInfo lhs_info;
         lhs_info.m0 = configs.m0;
@@ -195,6 +194,17 @@
         kernel_info.broadcast_bias          = true;
         kernel_info.activation_info         = act_info;
 
+        // Validate arguments
+        Status status{};
+        status = gemm.validate((&lhs)->info(), (&rhs)->info(), (&bias)->info(), (&dst)->info(), alpha, beta, lhs_info, rhs_info, kernel_info);
+        if(!status)
+        {
+            // Unsupported arguments
+            std::cerr << "Unsupported arguments." << std::endl;
+            std::cerr << "Check documentation for supported/unsupported combinations" << std::endl;
+            return false;
+        }
+
         // Configure function
         gemm.configure(&lhs, &rhs, &bias, &dst, alpha, beta, lhs_info, rhs_info, kernel_info);
 
diff --git a/examples/gemm_tuner/cl_gemm_reshaped.cpp b/examples/gemm_tuner/cl_gemm_reshaped.cpp
index e579ed7..9c6568c 100644
--- a/examples/gemm_tuner/cl_gemm_reshaped.cpp
+++ b/examples/gemm_tuner/cl_gemm_reshaped.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,7 @@
 #endif /* ARM_COMPUTE_CL */
 
 #include "CommonGemmExampleOptions.h"
+#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
 #include "arm_compute/core/Helpers.h"
@@ -52,15 +53,16 @@
 /** Structure holding all tunable gemm configs specific to this example/strategy */
 struct GemmConfigs
 {
-    size_t m0{ 4 };                /**< Number of rows processed by the matrix multiplication */
-    size_t n0{ 4 };                /**< Number of columns processed by the matrix multiplication */
-    size_t k0{ 4 };                /**< Number of partial accumulations performed by the matrix multiplication */
-    size_t v0{ 1 };                /**< Number of vertical blocks of size (m0xk0) stored on the same output row */
-    size_t h0{ 1 };                /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
-    bool   interleave_lhs{ true }; /**< Interleave lhs matrix */
-    bool   transpose_lhs{ true };  /**< Transpose lhs matrix. */
-    bool   interleave_rhs{ true }; /**< Interleave rhs matrix */
-    bool   transpose_rhs{ true };  /**< Transpose rhs matrix. */
+    size_t m0{ 4 };                        /**< Number of rows processed by the matrix multiplication */
+    size_t n0{ 4 };                        /**< Number of columns processed by the matrix multiplication */
+    size_t k0{ 4 };                        /**< Number of partial accumulations performed by the matrix multiplication */
+    size_t v0{ 1 };                        /**< Number of vertical blocks of size (m0xk0) stored on the same output row */
+    size_t h0{ 1 };                        /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
+    bool   interleave_lhs{ true };         /**< Interleave lhs matrix */
+    bool   transpose_lhs{ true };          /**< Transpose lhs matrix. */
+    bool   interleave_rhs{ true };         /**< Interleave rhs matrix */
+    bool   transpose_rhs{ true };          /**< Transpose rhs matrix. */
+    bool   export_to_cl_image_rhs{ true }; /**< Export rhs matrix to cl_image. */
 };
 
 /** Formatted output of the GemmConfigs type
@@ -84,6 +86,7 @@
     os << "transpose_lhs : " << (configs.transpose_lhs ? true_str : false_str) << std::endl;
     os << "interleave_rhs : " << (configs.interleave_rhs ? true_str : false_str) << std::endl;
     os << "transpose_rhs : " << (configs.transpose_rhs ? true_str : false_str) << std::endl;
+    os << "export_to_cl_image_rhs : " << (configs.export_to_cl_image_rhs ? true_str : false_str) << std::endl;
     return os;
 }
 
@@ -103,7 +106,8 @@
           h0(parser.add_positional_option<SimpleOption<size_t>>("h0", 1)),
           interleave_lhs(parser.add_positional_option<SimpleOption<size_t>>("interleave_lhs", 1)),
           interleave_rhs(parser.add_positional_option<SimpleOption<size_t>>("interleave_rhs", 1)),
-          transpose_rhs(parser.add_positional_option<SimpleOption<size_t>>("transpose_rhs", 1))
+          transpose_rhs(parser.add_positional_option<SimpleOption<size_t>>("transpose_rhs", 1)),
+          export_to_cl_image_rhs(parser.add_positional_option<SimpleOption<size_t>>("export_to_cl_image_rhs", 1))
     {
         m0->set_help("Number of rows processed by the matrix multiplication");
         n0->set_help("Number of columns processed by the matrix multiplication");
@@ -116,6 +120,7 @@
         // transpose_rhs are the opposites of each other. In the future we may extend the kernels to include the other
         // 2 variants (both transposed and none transposed)
         transpose_rhs->set_help("Transpose rhs matrix but not lhs matrix (1) / Do not transpose rhs matrix but do transpose lhs matrix (0)");
+        export_to_cl_image_rhs->set_help("Export rhs matrix to cl_image (1) / Do not export rhs matrix to cl_image (0)");
     }
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     GemmConfigOptions(const GemmConfigOptions &) = delete;
@@ -138,7 +143,8 @@
     // FIXME: Currently we only support 2 variants of the gemm reshaped kernels in which transpose_lhs and
     // transpose_rhs are the opposites of each other. In the future we may extend the kernels to include the other
     // 2 variants (both transposed and none transposed)
-    SimpleOption<size_t> *transpose_rhs; /**< Transpose rhs matrix option (1 enable; 0 disable). Also set the lhs matrix transpose option to the opposite. */
+    SimpleOption<size_t> *transpose_rhs;          /**< Transpose rhs matrix option (1 enable; 0 disable). Also set the lhs matrix transpose option to the opposite. */
+    SimpleOption<size_t> *export_to_cl_image_rhs; /**< Export rhs matrix to cl_image.*/
 };
 
 /** Consumes the gemm configuration options and creates a structure containing all information
@@ -159,9 +165,10 @@
     // FIXME: Currently we only support 2 variants of the gemm reshaped kernels in which transpose_lhs and
     // transpose_rhs are the opposites of each other. In the future we may extend the kernels to include the other
     // 2 variants (both transposed and none transposed)
-    configs.transpose_lhs  = options.transpose_rhs->value() == 0;
-    configs.interleave_rhs = options.interleave_rhs->value() != 0;
-    configs.transpose_rhs  = options.transpose_rhs->value() != 0;
+    configs.transpose_lhs          = options.transpose_rhs->value() == 0;
+    configs.interleave_rhs         = options.interleave_rhs->value() != 0;
+    configs.transpose_rhs          = options.transpose_rhs->value() != 0;
+    configs.export_to_cl_image_rhs = options.export_to_cl_image_rhs->value() != 0;
     return configs;
 }
 
@@ -177,10 +184,9 @@
     bool do_setup(int argc, char **argv) override
     {
         // Default parameters
-        const DataType            data_type = DataType::F32;
-        const float               alpha     = 1.0f;
-        const float               beta      = 0.0f;
-        const ActivationLayerInfo act_info  = ActivationLayerInfo();
+        const float               alpha    = 1.0f;
+        const float               beta     = 0.0f;
+        const ActivationLayerInfo act_info = ActivationLayerInfo();
         CommonGemmExampleParams   params;
         GemmConfigs               configs;
 
@@ -212,16 +218,16 @@
         }
 
         // Print gemm parameters and configurations
-        std::cerr << "Gemm parameters:" << std::endl;
-        std::cerr << params << std::endl;
-        std::cerr << "Gemm configurations:" << std::endl;
-        std::cerr << configs << std::endl;
+        std::cout << "Gemm parameters:" << std::endl;
+        std::cout << params << std::endl;
+        std::cout << "Gemm configurations:" << std::endl;
+        std::cout << configs << std::endl;
 
         CLScheduler::get().default_init(&tuner);
 
-        lhs.allocator()->init(TensorInfo(TensorShape(params.K, params.M, params.B), 1, data_type));
-        rhs.allocator()->init(TensorInfo(TensorShape(params.N, params.K, params.B), 1, data_type));
-        bias.allocator()->init(TensorInfo(TensorShape(params.N, 1, params.B), 1, data_type));
+        lhs.allocator()->init(TensorInfo(TensorShape(params.K, params.M, params.B), 1, params.data_type));
+        rhs.allocator()->init(TensorInfo(TensorShape(params.N, params.K, params.B), 1, params.data_type));
+        bias.allocator()->init(TensorInfo(TensorShape(params.N, 1, params.B), 1, params.data_type));
 
         GEMMLHSMatrixInfo lhs_info;
         lhs_info.m0         = configs.m0;
@@ -231,11 +237,12 @@
         lhs_info.transpose  = configs.transpose_lhs;
 
         GEMMRHSMatrixInfo rhs_info;
-        rhs_info.n0         = configs.n0;
-        rhs_info.k0         = configs.k0;
-        rhs_info.h0         = configs.h0;
-        rhs_info.interleave = configs.interleave_rhs;
-        rhs_info.transpose  = configs.transpose_rhs;
+        rhs_info.n0                 = configs.n0;
+        rhs_info.k0                 = configs.k0;
+        rhs_info.h0                 = configs.h0;
+        rhs_info.interleave         = configs.interleave_rhs;
+        rhs_info.transpose          = configs.transpose_rhs;
+        rhs_info.export_to_cl_image = configs.export_to_cl_image_rhs;
 
         GEMMKernelInfo kernel_info;
         kernel_info.m                       = params.M;
@@ -252,6 +259,31 @@
         // Initialise rhs_reshaped tensor info
         auto_init_if_empty(*rhs_reshaped.info(), rhs.info()->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*rhs.info(), rhs_info)));
 
+        if(rhs_info.export_to_cl_image)
+        {
+            arm_compute::cl_gemm::update_padding_for_cl_image(rhs_reshaped.info());
+        }
+
+        // Validate arguments
+        Status status{};
+        status = reshape_lhs.validate((&lhs)->info(), (&lhs_reshaped)->info(), lhs_info, kernel_info.reinterpret_input_as_3d);
+        if(!status)
+        {
+            // Unsupported arguments
+            std::cerr << "Unsupported arguments." << std::endl;
+            std::cerr << "Check documentation for supported/unsupported combinations" << std::endl;
+            return false;
+        }
+
+        status = gemm.validate((&lhs_reshaped)->info(), (&rhs_reshaped)->info(), (&bias)->info(), (&dst)->info(), alpha, beta, lhs_info, rhs_info, kernel_info);
+        if(!status)
+        {
+            // Unsupported arguments
+            std::cerr << "Unsupported arguments." << std::endl;
+            std::cerr << "Check documentation for supported/unsupported combinations" << std::endl;
+            return false;
+        }
+
         // Configure reshape lhs function
         reshape_lhs.configure(&lhs, &lhs_reshaped, lhs_info);
 
diff --git a/examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp b/examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp
index 0d161aa..f814c54 100644
--- a/examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp
+++ b/examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,7 @@
 #endif /* ARM_COMPUTE_CL */
 
 #include "CommonGemmExampleOptions.h"
+#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
 #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/KernelDescriptors.h"
@@ -51,12 +52,13 @@
 /** Structure holding all tunable gemm configs specific to this example/strategy */
 struct GemmConfigs
 {
-    size_t m0{ 4 };                /**< Number of rows processed by the matrix multiplication */
-    size_t n0{ 4 };                /**< Number of columns processed by the matrix multiplication */
-    size_t k0{ 4 };                /**< Number of partial accumulations performed by the matrix multiplication */
-    size_t h0{ 1 };                /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
-    bool   interleave_rhs{ true }; /**< Interleave rhs matrix */
-    bool   transpose_rhs{ true };  /**< Transpose rhs matrix */
+    size_t m0{ 4 };                        /**< Number of rows processed by the matrix multiplication */
+    size_t n0{ 4 };                        /**< Number of columns processed by the matrix multiplication */
+    size_t k0{ 4 };                        /**< Number of partial accumulations performed by the matrix multiplication */
+    size_t h0{ 1 };                        /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
+    bool   interleave_rhs{ true };         /**< Interleave rhs matrix */
+    bool   transpose_rhs{ true };          /**< Transpose rhs matrix */
+    bool   export_to_cl_image_rhs{ true }; /**< Export rhs matrix to cl_image.*/
 };
 
 /** Formatted output of the GemmConfigs type
@@ -77,6 +79,7 @@
     os << "h0 : " << configs.h0 << std::endl;
     os << "interleave_rhs : " << (configs.interleave_rhs ? true_str : false_str) << std::endl;
     os << "transpose_rhs : " << (configs.transpose_rhs ? true_str : false_str) << std::endl;
+    os << "export_to_cl_image_rhs : " << (configs.export_to_cl_image_rhs ? true_str : false_str) << std::endl;
     return os;
 }
 
@@ -94,7 +97,8 @@
           k0(parser.add_positional_option<SimpleOption<size_t>>("k0", 4)),
           h0(parser.add_positional_option<SimpleOption<size_t>>("h0", 1)),
           interleave_rhs(parser.add_positional_option<SimpleOption<size_t>>("interleave_rhs", 1)),
-          transpose_rhs(parser.add_positional_option<SimpleOption<size_t>>("transpose_rhs", 1))
+          transpose_rhs(parser.add_positional_option<SimpleOption<size_t>>("transpose_rhs", 1)),
+          export_to_cl_image_rhs(parser.add_positional_option<SimpleOption<size_t>>("export_to_cl_image_rhs", 1))
     {
         m0->set_help("Number of rows processed by the matrix multiplication");
         n0->set_help("Number of columns processed by the matrix multiplication");
@@ -102,6 +106,7 @@
         h0->set_help("Number of horizontal blocks of size (k0xn0) stored on the same output row");
         interleave_rhs->set_help("Interleave rhs matrix (1) / Do not interleave rhs matrix (0)");
         transpose_rhs->set_help("Transpose rhs matrix (1) / Do not transpose rhs matrix (0)");
+        export_to_cl_image_rhs->set_help("Export rhs matrix to cl_image (1) / Do not export rhs matrix to cl_image (0)");
     }
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     GemmConfigOptions(const GemmConfigOptions &) = delete;
@@ -114,12 +119,13 @@
     /** Default destructor */
     ~GemmConfigOptions() = default;
 
-    SimpleOption<size_t> *m0;             /**< Number of rows processed by the matrix multiplication option */
-    SimpleOption<size_t> *n0;             /**< Number of columns processed by the matrix multiplication option */
-    SimpleOption<size_t> *k0;             /**< Number of partial accumulations performed by the matrix multiplication option */
-    SimpleOption<size_t> *h0;             /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */
-    SimpleOption<size_t> *interleave_rhs; /**< Interleave rhs matrix option (1 enable; 0 disable) */
-    SimpleOption<size_t> *transpose_rhs;  /**< Transpose rhs matrix option (1 enable; 0 disable) */
+    SimpleOption<size_t> *m0;                     /**< Number of rows processed by the matrix multiplication option */
+    SimpleOption<size_t> *n0;                     /**< Number of columns processed by the matrix multiplication option */
+    SimpleOption<size_t> *k0;                     /**< Number of partial accumulations performed by the matrix multiplication option */
+    SimpleOption<size_t> *h0;                     /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */
+    SimpleOption<size_t> *interleave_rhs;         /**< Interleave rhs matrix option (1 enable; 0 disable) */
+    SimpleOption<size_t> *transpose_rhs;          /**< Transpose rhs matrix option (1 enable; 0 disable) */
+    SimpleOption<size_t> *export_to_cl_image_rhs; /**< Export rhs matrix to cl_image.*/
 };
 
 /** Consumes the gemm configuration options and creates a structure containing all information
@@ -131,12 +137,13 @@
 GemmConfigs consume_gemm_configs(const GemmConfigOptions &options)
 {
     GemmConfigs configs;
-    configs.m0             = options.m0->value();
-    configs.n0             = options.n0->value();
-    configs.k0             = options.k0->value();
-    configs.h0             = options.h0->value();
-    configs.interleave_rhs = options.interleave_rhs->value() != 0;
-    configs.transpose_rhs  = options.transpose_rhs->value() != 0;
+    configs.m0                     = options.m0->value();
+    configs.n0                     = options.n0->value();
+    configs.k0                     = options.k0->value();
+    configs.h0                     = options.h0->value();
+    configs.interleave_rhs         = options.interleave_rhs->value() != 0;
+    configs.transpose_rhs          = options.transpose_rhs->value() != 0;
+    configs.export_to_cl_image_rhs = options.export_to_cl_image_rhs->value() != 0;
     return configs;
 }
 
@@ -150,10 +157,9 @@
     bool do_setup(int argc, char **argv) override
     {
         // Default parameters
-        const DataType            data_type = DataType::F32;
-        const float               alpha     = 1.0f;
-        const float               beta      = 0.0f;
-        const ActivationLayerInfo act_info  = ActivationLayerInfo();
+        const float               alpha    = 1.0f;
+        const float               beta     = 0.0f;
+        const ActivationLayerInfo act_info = ActivationLayerInfo();
         CommonGemmExampleParams   params;
         GemmConfigs               configs;
 
@@ -185,27 +191,28 @@
         }
 
         // Print gemm parameters and configurations
-        std::cerr << "Gemm parameters:" << std::endl;
-        std::cerr << params << std::endl;
-        std::cerr << "Gemm configurations:" << std::endl;
-        std::cerr << configs << std::endl;
+        std::cout << "Gemm parameters:" << std::endl;
+        std::cout << params << std::endl;
+        std::cout << "Gemm configurations:" << std::endl;
+        std::cout << configs << std::endl;
 
         CLScheduler::get().default_init(&tuner);
 
-        lhs.allocator()->init(TensorInfo(TensorShape(params.K, params.M, params.B), 1, data_type));
-        rhs.allocator()->init(TensorInfo(TensorShape(params.N, params.K, params.B), 1, data_type));
-        bias.allocator()->init(TensorInfo(TensorShape(params.N, 1, params.B), 1, data_type));
+        lhs.allocator()->init(TensorInfo(TensorShape(params.K, params.M, params.B), 1, params.data_type));
+        rhs.allocator()->init(TensorInfo(TensorShape(params.N, params.K, params.B), 1, params.data_type));
+        bias.allocator()->init(TensorInfo(TensorShape(params.N, 1, params.B), 1, params.data_type));
 
         GEMMLHSMatrixInfo lhs_info;
         lhs_info.m0 = configs.m0;
         lhs_info.k0 = configs.k0;
 
         GEMMRHSMatrixInfo rhs_info;
-        rhs_info.n0         = configs.n0;
-        rhs_info.k0         = configs.k0;
-        rhs_info.h0         = configs.h0;
-        rhs_info.interleave = configs.interleave_rhs;
-        rhs_info.transpose  = configs.transpose_rhs;
+        rhs_info.n0                 = configs.n0;
+        rhs_info.k0                 = configs.k0;
+        rhs_info.h0                 = configs.h0;
+        rhs_info.interleave         = configs.interleave_rhs;
+        rhs_info.transpose          = configs.transpose_rhs;
+        rhs_info.export_to_cl_image = configs.export_to_cl_image_rhs;
 
         GEMMKernelInfo kernel_info;
         kernel_info.m                       = params.M;
@@ -219,6 +226,22 @@
         // Initialise rhs_reshaped tensor info
         auto_init_if_empty(*rhs_reshaped.info(), rhs.info()->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*rhs.info(), rhs_info)));
 
+        if(rhs_info.export_to_cl_image)
+        {
+            arm_compute::cl_gemm::update_padding_for_cl_image(rhs_reshaped.info());
+        }
+
+        // Validate arguments
+        Status status{};
+        status = gemm.validate((&lhs)->info(), (&rhs_reshaped)->info(), (&bias)->info(), (&dst)->info(), alpha, beta, lhs_info, rhs_info, kernel_info);
+        if(!status)
+        {
+            // Unsupported arguments
+            std::cerr << "Unsupported arguments." << std::endl;
+            std::cerr << "Check documentation for supported/unsupported combinations" << std::endl;
+            return false;
+        }
+
         // Configure function
         gemm.configure(&lhs, &rhs_reshaped, &bias, &dst, alpha, beta, lhs_info, rhs_info, kernel_info);
 
diff --git a/examples/graph_alexnet.cpp b/examples/graph_alexnet.cpp
index 25ede6d..40bbee1 100644
--- a/examples/graph_alexnet.cpp
+++ b/examples/graph_alexnet.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/graph_deepspeech_v0_4_1.cpp b/examples/graph_deepspeech_v0_4_1.cpp
index b655452..4a8a8b1 100644
--- a/examples/graph_deepspeech_v0_4_1.cpp
+++ b/examples/graph_deepspeech_v0_4_1.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/graph_edsr.cpp b/examples/graph_edsr.cpp
index 2f2a9fa..3868f0f 100644
--- a/examples/graph_edsr.cpp
+++ b/examples/graph_edsr.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/graph_edsr.h b/examples/graph_edsr.h
index cb467d0..42a2789 100644
--- a/examples/graph_edsr.h
+++ b/examples/graph_edsr.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/graph_googlenet.cpp b/examples/graph_googlenet.cpp
index 84a10ff..ed5cbd5 100644
--- a/examples/graph_googlenet.cpp
+++ b/examples/graph_googlenet.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/graph_inception_resnet_v1.cpp b/examples/graph_inception_resnet_v1.cpp
index ea9bf8f..7c0bb0c 100644
--- a/examples/graph_inception_resnet_v1.cpp
+++ b/examples/graph_inception_resnet_v1.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/graph_inception_resnet_v2.cpp b/examples/graph_inception_resnet_v2.cpp
index d2f6e1d..d14c34e 100644
--- a/examples/graph_inception_resnet_v2.cpp
+++ b/examples/graph_inception_resnet_v2.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/graph_inception_v3.cpp b/examples/graph_inception_v3.cpp
index 03d5dff..4b6dc8d 100644
--- a/examples/graph_inception_v3.cpp
+++ b/examples/graph_inception_v3.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/graph_inception_v4.cpp b/examples/graph_inception_v4.cpp
index c78bbb2..553c96d 100644
--- a/examples/graph_inception_v4.cpp
+++ b/examples/graph_inception_v4.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/graph_lenet.cpp b/examples/graph_lenet.cpp
index 7b475c2..e578307 100644
--- a/examples/graph_lenet.cpp
+++ b/examples/graph_lenet.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/graph_mnist.cpp b/examples/graph_mnist.cpp
index 56d5c96..85ab0ab 100644
--- a/examples/graph_mnist.cpp
+++ b/examples/graph_mnist.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/graph_mobilenet.cpp b/examples/graph_mobilenet.cpp
index bb89399..f74d251 100644
--- a/examples/graph_mobilenet.cpp
+++ b/examples/graph_mobilenet.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/graph_mobilenet_v2.cpp b/examples/graph_mobilenet_v2.cpp
index 0d6b471..5ee1f7e 100644
--- a/examples/graph_mobilenet_v2.cpp
+++ b/examples/graph_mobilenet_v2.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/graph_resnet12.cpp b/examples/graph_resnet12.cpp
index 120cc9b..badcaec 100644
--- a/examples/graph_resnet12.cpp
+++ b/examples/graph_resnet12.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/graph_resnet50.cpp b/examples/graph_resnet50.cpp
index 7af058e..2939ee4 100644
--- a/examples/graph_resnet50.cpp
+++ b/examples/graph_resnet50.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/graph_resnet_v2_50.cpp b/examples/graph_resnet_v2_50.cpp
index 7d6b9aa..32434f5 100644
--- a/examples/graph_resnet_v2_50.cpp
+++ b/examples/graph_resnet_v2_50.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/graph_resnext50.cpp b/examples/graph_resnext50.cpp
index 2c50594..1d9ed8d 100644
--- a/examples/graph_resnext50.cpp
+++ b/examples/graph_resnext50.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/graph_shufflenet.cpp b/examples/graph_shufflenet.cpp
index 0b97798..08f884b 100644
--- a/examples/graph_shufflenet.cpp
+++ b/examples/graph_shufflenet.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/graph_squeezenet.cpp b/examples/graph_squeezenet.cpp
index 35fceb4..f0d620c 100644
--- a/examples/graph_squeezenet.cpp
+++ b/examples/graph_squeezenet.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/graph_squeezenet_v1_1.cpp b/examples/graph_squeezenet_v1_1.cpp
index f648b63..c604486 100644
--- a/examples/graph_squeezenet_v1_1.cpp
+++ b/examples/graph_squeezenet_v1_1.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/graph_srcnn955.cpp b/examples/graph_srcnn955.cpp
index 1892106..a95f0c1 100644
--- a/examples/graph_srcnn955.cpp
+++ b/examples/graph_srcnn955.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/graph_ssd_mobilenet.cpp b/examples/graph_ssd_mobilenet.cpp
index f2a8b30..edd4c94 100644
--- a/examples/graph_ssd_mobilenet.cpp
+++ b/examples/graph_ssd_mobilenet.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/graph_vgg16.cpp b/examples/graph_vgg16.cpp
index f6996da..990040b 100644
--- a/examples/graph_vgg16.cpp
+++ b/examples/graph_vgg16.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,16 +41,6 @@
     }
     bool do_setup(int argc, char **argv) override
     {
-        // Check if the system has enough RAM to run the example, systems with less than 2GB have
-        // to hint the API to minimize memory consumption otherwise it'll run out of memory and
-        // fail throwing the bad_alloc exception
-        arm_compute::MEMInfo meminfo;
-        const size_t         mem_total = meminfo.get_total_in_kb();
-        if(mem_total <= arm_compute::MEMInfo::TWO_GB_IN_KB)
-        {
-            arm_compute::MEMInfo::set_policy(arm_compute::MemoryPolicy::MINIMIZE);
-        }
-
         // Parse arguments
         cmd_parser.parse(argc, argv);
         cmd_parser.validate();
diff --git a/examples/graph_vgg19.cpp b/examples/graph_vgg19.cpp
index f9f5c21..9215ba7 100644
--- a/examples/graph_vgg19.cpp
+++ b/examples/graph_vgg19.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -40,16 +40,6 @@
     }
     bool do_setup(int argc, char **argv) override
     {
-        // Check if the system has enough RAM to run the example, systems with less than 2GB have
-        // to hint the API to minimize memory consumption otherwise it'll run out of memory and
-        // fail throwing the bad_alloc exception
-        arm_compute::MEMInfo meminfo;
-        const size_t         mem_total = meminfo.get_total_in_kb();
-        if(mem_total <= arm_compute::MEMInfo::TWO_GB_IN_KB)
-        {
-            arm_compute::MEMInfo::set_policy(arm_compute::MemoryPolicy::MINIMIZE);
-        }
-
         // Parse arguments
         cmd_parser.parse(argc, argv);
         cmd_parser.validate();
diff --git a/examples/graph_vgg_vdsr.cpp b/examples/graph_vgg_vdsr.cpp
index c308236..65c0642 100644
--- a/examples/graph_vgg_vdsr.cpp
+++ b/examples/graph_vgg_vdsr.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/graph_yolov3.cpp b/examples/graph_yolov3.cpp
index bbc6b72..c7f917b 100644
--- a/examples/graph_yolov3.cpp
+++ b/examples/graph_yolov3.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/graph_yolov3_output_detector.cpp b/examples/graph_yolov3_output_detector.cpp
new file mode 100644
index 0000000..6278565
--- /dev/null
+++ b/examples/graph_yolov3_output_detector.cpp
@@ -0,0 +1,626 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/graph.h"
+#include "arm_compute/graph/Utils.h"
+
+#include "support/ToolchainSupport.h"
+#include "utils/CommonGraphOptions.h"
+#include "utils/GraphUtils.h"
+#include "utils/Utils.h"
+
+using namespace arm_compute::graph;
+using namespace arm_compute::utils;
+
+class GraphYoloV3OutputDetector
+{
+public:
+    GraphYoloV3OutputDetector()
+        : _graph(0, "GraphYoloV3OutputDetector")
+    {
+    }
+
+    bool setup(const CommonGraphParams &common_params, const SimpleOption<std::string> &expected_output_filename)
+    {
+        using namespace arm_compute;
+        using namespace graph_utils;
+
+        const DataLayout  data_layout = common_params.data_layout;
+        const std::string data_path   = common_params.data_path;
+        const Target      target      = common_params.target;
+
+        const DataLayoutDimension x_dim = (data_layout == DataLayout::NHWC) ? DataLayoutDimension::CHANNEL : DataLayoutDimension::WIDTH;
+        const DataLayoutDimension y_dim = (data_layout == DataLayout::NHWC) ? DataLayoutDimension::WIDTH : DataLayoutDimension::HEIGHT;
+
+        NodeID id_ConstantFolding_truediv_1_recip = _graph.add_node<ConstNode>(
+                                                        TensorDescriptor
+        {
+            TensorShape{ 1, 1, 1 },
+            DataType::F32,
+            QuantizationInfo(),
+            data_layout });
+        INode *node_ConstantFolding_truediv_1_recip = _graph.node(id_ConstantFolding_truediv_1_recip);
+        node_ConstantFolding_truediv_1_recip->set_common_node_parameters(NodeParams{ "ConstantFolding_truediv_1_recip", target });
+        node_ConstantFolding_truediv_1_recip->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/yolov3_output_detector/ConstantFolding_truediv_1_recip.npy", data_layout));
+
+        NodeID id_ConstantFolding_truediv_recip = _graph.add_node<ConstNode>(
+                                                      TensorDescriptor
+        {
+            TensorShape{ 1, 1, 1 },
+            DataType::F32,
+            QuantizationInfo(),
+            data_layout });
+        INode *node_ConstantFolding_truediv_recip = _graph.node(id_ConstantFolding_truediv_recip);
+        node_ConstantFolding_truediv_recip->set_common_node_parameters(NodeParams{ "ConstantFolding_truediv_recip", target });
+        node_ConstantFolding_truediv_recip->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/yolov3_output_detector/ConstantFolding_truediv_recip.npy", data_layout));
+
+        NodeID id_detector_yolo_v3_mul_6_y = _graph.add_node<ConstNode>(
+                                                 TensorDescriptor
+        {
+            TensorShape{ 2 },
+            DataType::F32,
+            QuantizationInfo(),
+            data_layout });
+        INode *node_detector_yolo_v3_mul_6_y = _graph.node(id_detector_yolo_v3_mul_6_y);
+        node_detector_yolo_v3_mul_6_y->set_common_node_parameters(NodeParams{ "detector_yolo_v3_mul_6_y", target });
+        node_detector_yolo_v3_mul_6_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/yolov3_output_detector/detector_yolo-v3_mul_6_y.npy", data_layout));
+
+        NodeID id_detector_yolo_v3_mul_3_y = _graph.add_node<ConstNode>(
+                                                 TensorDescriptor
+        {
+            TensorShape{ 2 },
+            DataType::F32,
+            QuantizationInfo(),
+            data_layout });
+        INode *node_detector_yolo_v3_mul_3_y = _graph.node(id_detector_yolo_v3_mul_3_y);
+        node_detector_yolo_v3_mul_3_y->set_common_node_parameters(NodeParams{ "detector_yolo_v3_mul_3_y", target });
+        node_detector_yolo_v3_mul_3_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/yolov3_output_detector/detector_yolo-v3_mul_3_y.npy", data_layout));
+
+        NodeID id_detector_yolo_v3_mul_y = _graph.add_node<ConstNode>(
+                                               TensorDescriptor
+        {
+            TensorShape{ 2 },
+            DataType::F32,
+            QuantizationInfo(),
+            data_layout });
+        INode *node_detector_yolo_v3_mul_y = _graph.node(id_detector_yolo_v3_mul_y);
+        node_detector_yolo_v3_mul_y->set_common_node_parameters(NodeParams{ "detector_yolo_v3_mul_y", target });
+        node_detector_yolo_v3_mul_y->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/yolov3_output_detector/detector_yolo-v3_mul_y.npy", data_layout));
+
+        NodeID id_detector_yolo_v3_mul_7 = _graph.add_node<ConstNode>(
+                                               TensorDescriptor
+        {
+            TensorShape{ 2, 8112 },
+            DataType::F32,
+            QuantizationInfo(),
+            data_layout });
+        INode *node_detector_yolo_v3_mul_7 = _graph.node(id_detector_yolo_v3_mul_7);
+        node_detector_yolo_v3_mul_7->set_common_node_parameters(NodeParams{ "detector_yolo_v3_mul_7", target });
+        node_detector_yolo_v3_mul_7->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/yolov3_output_detector/detector_yolo-v3_mul_7.npy", data_layout));
+
+        NodeID id_detector_yolo_v3_Reshape_11 = _graph.add_node<ConstNode>(
+                                                    TensorDescriptor
+        {
+            TensorShape{ 2, 8112 },
+            DataType::F32,
+            QuantizationInfo(),
+            data_layout });
+        INode *node_detector_yolo_v3_Reshape_11 = _graph.node(id_detector_yolo_v3_Reshape_11);
+        node_detector_yolo_v3_Reshape_11->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Reshape_11", target });
+        node_detector_yolo_v3_Reshape_11->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/yolov3_output_detector/detector_yolo-v3_Reshape_11.npy", data_layout));
+
+        NodeID id_detector_yolo_v3_mul_4 = _graph.add_node<ConstNode>(
+                                               TensorDescriptor
+        {
+            TensorShape{ 2, 2028 },
+            DataType::F32,
+            QuantizationInfo(),
+            data_layout });
+        INode *node_detector_yolo_v3_mul_4 = _graph.node(id_detector_yolo_v3_mul_4);
+        node_detector_yolo_v3_mul_4->set_common_node_parameters(NodeParams{ "detector_yolo_v3_mul_4", target });
+        node_detector_yolo_v3_mul_4->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/yolov3_output_detector/detector_yolo-v3_mul_4.npy", data_layout));
+
+        NodeID id_detector_yolo_v3_Reshape_7 = _graph.add_node<ConstNode>(
+                                                   TensorDescriptor
+        {
+            TensorShape{ 2, 2028 },
+            DataType::F32,
+            QuantizationInfo(),
+            data_layout });
+        INode *node_detector_yolo_v3_Reshape_7 = _graph.node(id_detector_yolo_v3_Reshape_7);
+        node_detector_yolo_v3_Reshape_7->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Reshape_7", target });
+        node_detector_yolo_v3_Reshape_7->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/yolov3_output_detector/detector_yolo-v3_Reshape_7.npy", data_layout));
+
+        NodeID id_detector_yolo_v3_mul_1 = _graph.add_node<ConstNode>(
+                                               TensorDescriptor
+        {
+            TensorShape{ 2, 507 },
+            DataType::F32,
+            QuantizationInfo(),
+            data_layout });
+        INode *node_detector_yolo_v3_mul_1 = _graph.node(id_detector_yolo_v3_mul_1);
+        node_detector_yolo_v3_mul_1->set_common_node_parameters(NodeParams{ "detector_yolo_v3_mul_1", target });
+        node_detector_yolo_v3_mul_1->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/yolov3_output_detector/detector_yolo-v3_mul_1.npy", data_layout));
+
+        NodeID id_detector_yolo_v3_Reshape_3 = _graph.add_node<ConstNode>(
+                                                   TensorDescriptor
+        {
+            TensorShape{ 2, 507 },
+            DataType::F32,
+            QuantizationInfo(),
+            data_layout });
+        INode *node_detector_yolo_v3_Reshape_3 = _graph.node(id_detector_yolo_v3_Reshape_3);
+        node_detector_yolo_v3_Reshape_3->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Reshape_3", target });
+        node_detector_yolo_v3_Reshape_3->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/yolov3_output_detector/detector_yolo-v3_Reshape_3.npy", data_layout));
+
+        NodeID id_input_to_detector_3 = _graph.add_node<InputNode>(
+                                            TensorDescriptor
+        {
+            TensorShape{ 255, 52, 52, 1 },
+            DataType::F32,
+            QuantizationInfo(),
+            data_layout });
+        INode *node_input_to_detector_3 = _graph.node(id_input_to_detector_3);
+        node_input_to_detector_3->set_common_node_parameters(NodeParams{ "input_to_detector_3", target });
+        node_input_to_detector_3->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/yolov3_output_detector/input_to_detector_3.npy", data_layout));
+
+        NodeID id_detector_yolo_v3_Reshape_10 = _graph.add_node<ReshapeLayerNode>(
+                                                    TensorShape{ 85, 8112 });
+        INode *node_detector_yolo_v3_Reshape_10 = _graph.node(id_detector_yolo_v3_Reshape_10);
+        node_detector_yolo_v3_Reshape_10->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Reshape_10", target });
+        _graph.add_connection(id_input_to_detector_3, 0, id_detector_yolo_v3_Reshape_10, 0);
+
+        NodeID id_detector_yolo_v3_split_2 = _graph.add_node<SplitLayerNode>(
+                                                 4,
+                                                 0,
+                                                 std::vector<int> { 2, 2, 1, 80 });
+        INode *node_detector_yolo_v3_split_2 = _graph.node(id_detector_yolo_v3_split_2);
+        node_detector_yolo_v3_split_2->set_common_node_parameters(NodeParams{ "detector_yolo_v3_split_2", target });
+        _graph.add_connection(id_detector_yolo_v3_Reshape_10, 0, id_detector_yolo_v3_split_2, 0);
+
+        NodeID id_detector_yolo_v3_Sigmoid_6 = _graph.add_node<ActivationLayerNode>(
+                                                   ActivationLayerInfo{ ActivationLayerInfo::ActivationFunction::LOGISTIC, 0, 0 });
+        INode *node_detector_yolo_v3_Sigmoid_6 = _graph.node(id_detector_yolo_v3_Sigmoid_6);
+        node_detector_yolo_v3_Sigmoid_6->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Sigmoid_6", target });
+        _graph.add_connection(id_detector_yolo_v3_split_2, 0, id_detector_yolo_v3_Sigmoid_6, 0);
+
+        NodeID id_detector_yolo_v3_add_2 = _graph.add_node<EltwiseLayerNode>(
+                                               descriptors::EltwiseLayerDescriptor
+        {
+            EltwiseOperation::Add,
+            QuantizationInfo() });
+        INode *node_detector_yolo_v3_add_2 = _graph.node(id_detector_yolo_v3_add_2);
+        node_detector_yolo_v3_add_2->set_common_node_parameters(NodeParams{ "detector_yolo_v3_add_2", target });
+        _graph.add_connection(id_detector_yolo_v3_Sigmoid_6, 0, id_detector_yolo_v3_add_2, 0);
+        _graph.add_connection(id_detector_yolo_v3_Reshape_11, 0, id_detector_yolo_v3_add_2, 1);
+
+        NodeID id_detector_yolo_v3_mul_6 = _graph.add_node<EltwiseLayerNode>(
+                                               descriptors::EltwiseLayerDescriptor
+        {
+            EltwiseOperation::Mul,
+            QuantizationInfo() });
+        INode *node_detector_yolo_v3_mul_6 = _graph.node(id_detector_yolo_v3_mul_6);
+        node_detector_yolo_v3_mul_6->set_common_node_parameters(NodeParams{ "detector_yolo_v3_mul_6", target });
+        _graph.add_connection(id_detector_yolo_v3_add_2, 0, id_detector_yolo_v3_mul_6, 0);
+        _graph.add_connection(id_detector_yolo_v3_mul_6_y, 0, id_detector_yolo_v3_mul_6, 1);
+
+        NodeID id_detector_yolo_v3_Sigmoid_7 = _graph.add_node<ActivationLayerNode>(
+                                                   ActivationLayerInfo{ ActivationLayerInfo::ActivationFunction::LOGISTIC, 0, 0 });
+        INode *node_detector_yolo_v3_Sigmoid_7 = _graph.node(id_detector_yolo_v3_Sigmoid_7);
+        node_detector_yolo_v3_Sigmoid_7->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Sigmoid_7", target });
+        _graph.add_connection(id_detector_yolo_v3_split_2, 2, id_detector_yolo_v3_Sigmoid_7, 0);
+
+        NodeID id_detector_yolo_v3_Exp_2 = _graph.add_node<UnaryEltwiseLayerNode>(
+                                               descriptors::UnaryEltwiseLayerDescriptor
+        {
+            UnaryEltwiseOperation::Exp,
+            QuantizationInfo() });
+        INode *node_detector_yolo_v3_Exp_2 = _graph.node(id_detector_yolo_v3_Exp_2);
+        node_detector_yolo_v3_Exp_2->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Exp_2", target });
+        _graph.add_connection(id_detector_yolo_v3_split_2, 1, id_detector_yolo_v3_Exp_2, 0);
+
+        NodeID id_detector_yolo_v3_mul_8 = _graph.add_node<EltwiseLayerNode>(
+                                               descriptors::EltwiseLayerDescriptor
+        {
+            EltwiseOperation::Mul,
+            QuantizationInfo() });
+        INode *node_detector_yolo_v3_mul_8 = _graph.node(id_detector_yolo_v3_mul_8);
+        node_detector_yolo_v3_mul_8->set_common_node_parameters(NodeParams{ "detector_yolo_v3_mul_8", target });
+        _graph.add_connection(id_detector_yolo_v3_Exp_2, 0, id_detector_yolo_v3_mul_8, 0);
+        _graph.add_connection(id_detector_yolo_v3_mul_7, 0, id_detector_yolo_v3_mul_8, 1);
+
+        NodeID id_detector_yolo_v3_Sigmoid_8 = _graph.add_node<ActivationLayerNode>(
+                                                   ActivationLayerInfo{ ActivationLayerInfo::ActivationFunction::LOGISTIC, 0, 0 });
+        INode *node_detector_yolo_v3_Sigmoid_8 = _graph.node(id_detector_yolo_v3_Sigmoid_8);
+        node_detector_yolo_v3_Sigmoid_8->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Sigmoid_8", target });
+        _graph.add_connection(id_detector_yolo_v3_split_2, 3, id_detector_yolo_v3_Sigmoid_8, 0);
+
+        NodeID id_detector_yolo_v3_concat_8 = _graph.add_node<ConcatenateLayerNode>(
+                                                  4,
+                                                  descriptors::ConcatLayerDescriptor{ x_dim });
+        INode *node_detector_yolo_v3_concat_8 = _graph.node(id_detector_yolo_v3_concat_8);
+        node_detector_yolo_v3_concat_8->set_common_node_parameters(NodeParams{ "detector_yolo_v3_concat_8", target });
+        _graph.add_connection(id_detector_yolo_v3_mul_6, 0, id_detector_yolo_v3_concat_8, 0);
+        _graph.add_connection(id_detector_yolo_v3_mul_8, 0, id_detector_yolo_v3_concat_8, 1);
+        _graph.add_connection(id_detector_yolo_v3_Sigmoid_7, 0, id_detector_yolo_v3_concat_8, 2);
+        _graph.add_connection(id_detector_yolo_v3_Sigmoid_8, 0, id_detector_yolo_v3_concat_8, 3);
+
+        NodeID id_input_to_detector_2 = _graph.add_node<InputNode>(
+                                            TensorDescriptor
+        {
+            TensorShape{ 255, 26, 26, 1 },
+            DataType::F32,
+            QuantizationInfo(),
+            data_layout });
+        INode *node_input_to_detector_2 = _graph.node(id_input_to_detector_2);
+        node_input_to_detector_2->set_common_node_parameters(NodeParams{ "input_to_detector_2", target });
+        node_input_to_detector_2->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/yolov3_output_detector/input_to_detector_2.npy", data_layout));
+
+        NodeID id_detector_yolo_v3_Reshape_6 = _graph.add_node<ReshapeLayerNode>(
+                                                   TensorShape{ 85, 2028 });
+        INode *node_detector_yolo_v3_Reshape_6 = _graph.node(id_detector_yolo_v3_Reshape_6);
+        node_detector_yolo_v3_Reshape_6->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Reshape_6", target });
+        _graph.add_connection(id_input_to_detector_2, 0, id_detector_yolo_v3_Reshape_6, 0);
+
+        NodeID id_detector_yolo_v3_split_1 = _graph.add_node<SplitLayerNode>(
+                                                 4,
+                                                 0,
+                                                 std::vector<int> { 2, 2, 1, 80 });
+        INode *node_detector_yolo_v3_split_1 = _graph.node(id_detector_yolo_v3_split_1);
+        node_detector_yolo_v3_split_1->set_common_node_parameters(NodeParams{ "detector_yolo_v3_split_1", target });
+        _graph.add_connection(id_detector_yolo_v3_Reshape_6, 0, id_detector_yolo_v3_split_1, 0);
+
+        NodeID id_detector_yolo_v3_Sigmoid_3 = _graph.add_node<ActivationLayerNode>(
+                                                   ActivationLayerInfo{ ActivationLayerInfo::ActivationFunction::LOGISTIC, 0, 0 });
+        INode *node_detector_yolo_v3_Sigmoid_3 = _graph.node(id_detector_yolo_v3_Sigmoid_3);
+        node_detector_yolo_v3_Sigmoid_3->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Sigmoid_3", target });
+        _graph.add_connection(id_detector_yolo_v3_split_1, 0, id_detector_yolo_v3_Sigmoid_3, 0);
+
+        NodeID id_detector_yolo_v3_add_1 = _graph.add_node<EltwiseLayerNode>(
+                                               descriptors::EltwiseLayerDescriptor
+        {
+            EltwiseOperation::Add,
+            QuantizationInfo() });
+        INode *node_detector_yolo_v3_add_1 = _graph.node(id_detector_yolo_v3_add_1);
+        node_detector_yolo_v3_add_1->set_common_node_parameters(NodeParams{ "detector_yolo_v3_add_1", target });
+        _graph.add_connection(id_detector_yolo_v3_Sigmoid_3, 0, id_detector_yolo_v3_add_1, 0);
+        _graph.add_connection(id_detector_yolo_v3_Reshape_7, 0, id_detector_yolo_v3_add_1, 1);
+
+        NodeID id_detector_yolo_v3_mul_3 = _graph.add_node<EltwiseLayerNode>(
+                                               descriptors::EltwiseLayerDescriptor
+        {
+            EltwiseOperation::Mul,
+            QuantizationInfo() });
+        INode *node_detector_yolo_v3_mul_3 = _graph.node(id_detector_yolo_v3_mul_3);
+        node_detector_yolo_v3_mul_3->set_common_node_parameters(NodeParams{ "detector_yolo_v3_mul_3", target });
+        _graph.add_connection(id_detector_yolo_v3_add_1, 0, id_detector_yolo_v3_mul_3, 0);
+        _graph.add_connection(id_detector_yolo_v3_mul_3_y, 0, id_detector_yolo_v3_mul_3, 1);
+
+        NodeID id_detector_yolo_v3_Sigmoid_4 = _graph.add_node<ActivationLayerNode>(
+                                                   ActivationLayerInfo{ ActivationLayerInfo::ActivationFunction::LOGISTIC, 0, 0 });
+        INode *node_detector_yolo_v3_Sigmoid_4 = _graph.node(id_detector_yolo_v3_Sigmoid_4);
+        node_detector_yolo_v3_Sigmoid_4->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Sigmoid_4", target });
+        _graph.add_connection(id_detector_yolo_v3_split_1, 2, id_detector_yolo_v3_Sigmoid_4, 0);
+
+        NodeID id_detector_yolo_v3_Exp_1 = _graph.add_node<UnaryEltwiseLayerNode>(
+                                               descriptors::UnaryEltwiseLayerDescriptor
+        {
+            UnaryEltwiseOperation::Exp,
+            QuantizationInfo() });
+        INode *node_detector_yolo_v3_Exp_1 = _graph.node(id_detector_yolo_v3_Exp_1);
+        node_detector_yolo_v3_Exp_1->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Exp_1", target });
+        _graph.add_connection(id_detector_yolo_v3_split_1, 1, id_detector_yolo_v3_Exp_1, 0);
+
+        NodeID id_detector_yolo_v3_mul_5 = _graph.add_node<EltwiseLayerNode>(
+                                               descriptors::EltwiseLayerDescriptor
+        {
+            EltwiseOperation::Mul,
+            QuantizationInfo() });
+        INode *node_detector_yolo_v3_mul_5 = _graph.node(id_detector_yolo_v3_mul_5);
+        node_detector_yolo_v3_mul_5->set_common_node_parameters(NodeParams{ "detector_yolo_v3_mul_5", target });
+        _graph.add_connection(id_detector_yolo_v3_Exp_1, 0, id_detector_yolo_v3_mul_5, 0);
+        _graph.add_connection(id_detector_yolo_v3_mul_4, 0, id_detector_yolo_v3_mul_5, 1);
+
+        NodeID id_detector_yolo_v3_Sigmoid_5 = _graph.add_node<ActivationLayerNode>(
+                                                   ActivationLayerInfo{ ActivationLayerInfo::ActivationFunction::LOGISTIC, 0, 0 });
+        INode *node_detector_yolo_v3_Sigmoid_5 = _graph.node(id_detector_yolo_v3_Sigmoid_5);
+        node_detector_yolo_v3_Sigmoid_5->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Sigmoid_5", target });
+        _graph.add_connection(id_detector_yolo_v3_split_1, 3, id_detector_yolo_v3_Sigmoid_5, 0);
+
+        NodeID id_detector_yolo_v3_concat_5 = _graph.add_node<ConcatenateLayerNode>(
+                                                  4,
+                                                  descriptors::ConcatLayerDescriptor{ x_dim });
+        INode *node_detector_yolo_v3_concat_5 = _graph.node(id_detector_yolo_v3_concat_5);
+        node_detector_yolo_v3_concat_5->set_common_node_parameters(NodeParams{ "detector_yolo_v3_concat_5", target });
+        _graph.add_connection(id_detector_yolo_v3_mul_3, 0, id_detector_yolo_v3_concat_5, 0);
+        _graph.add_connection(id_detector_yolo_v3_mul_5, 0, id_detector_yolo_v3_concat_5, 1);
+        _graph.add_connection(id_detector_yolo_v3_Sigmoid_4, 0, id_detector_yolo_v3_concat_5, 2);
+        _graph.add_connection(id_detector_yolo_v3_Sigmoid_5, 0, id_detector_yolo_v3_concat_5, 3);
+
+        NodeID id_input_to_detector_1 = _graph.add_node<InputNode>(
+                                            TensorDescriptor
+        {
+            TensorShape{ 255, 13, 13, 1 },
+            DataType::F32,
+            QuantizationInfo(),
+            data_layout });
+        INode *node_input_to_detector_1 = _graph.node(id_input_to_detector_1);
+        node_input_to_detector_1->set_common_node_parameters(NodeParams{ "input_to_detector_1", target });
+        node_input_to_detector_1->output(0)->set_accessor(get_weights_accessor(data_path, "/cnn_data/yolov3_output_detector/input_to_detector_1.npy", data_layout));
+
+        NodeID id_detector_yolo_v3_Reshape_2 = _graph.add_node<ReshapeLayerNode>(
+                                                   TensorShape{ 85, 507 });
+        INode *node_detector_yolo_v3_Reshape_2 = _graph.node(id_detector_yolo_v3_Reshape_2);
+        node_detector_yolo_v3_Reshape_2->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Reshape_2", target });
+        _graph.add_connection(id_input_to_detector_1, 0, id_detector_yolo_v3_Reshape_2, 0);
+
+        NodeID id_detector_yolo_v3_split = _graph.add_node<SplitLayerNode>(
+                                               4,
+                                               0,
+                                               std::vector<int> { 2, 2, 1, 80 });
+        INode *node_detector_yolo_v3_split = _graph.node(id_detector_yolo_v3_split);
+        node_detector_yolo_v3_split->set_common_node_parameters(NodeParams{ "detector_yolo_v3_split", target });
+        _graph.add_connection(id_detector_yolo_v3_Reshape_2, 0, id_detector_yolo_v3_split, 0);
+
+        NodeID id_detector_yolo_v3_Sigmoid = _graph.add_node<ActivationLayerNode>(
+                                                 ActivationLayerInfo{ ActivationLayerInfo::ActivationFunction::LOGISTIC, 0, 0 });
+        INode *node_detector_yolo_v3_Sigmoid = _graph.node(id_detector_yolo_v3_Sigmoid);
+        node_detector_yolo_v3_Sigmoid->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Sigmoid", target });
+        _graph.add_connection(id_detector_yolo_v3_split, 0, id_detector_yolo_v3_Sigmoid, 0);
+
+        NodeID id_detector_yolo_v3_add = _graph.add_node<EltwiseLayerNode>(
+                                             descriptors::EltwiseLayerDescriptor
+        {
+            EltwiseOperation::Add,
+            QuantizationInfo() });
+        INode *node_detector_yolo_v3_add = _graph.node(id_detector_yolo_v3_add);
+        node_detector_yolo_v3_add->set_common_node_parameters(NodeParams{ "detector_yolo_v3_add", target });
+        _graph.add_connection(id_detector_yolo_v3_Sigmoid, 0, id_detector_yolo_v3_add, 0);
+        _graph.add_connection(id_detector_yolo_v3_Reshape_3, 0, id_detector_yolo_v3_add, 1);
+
+        NodeID id_detector_yolo_v3_mul = _graph.add_node<EltwiseLayerNode>(
+                                             descriptors::EltwiseLayerDescriptor
+        {
+            EltwiseOperation::Mul,
+            QuantizationInfo() });
+        INode *node_detector_yolo_v3_mul = _graph.node(id_detector_yolo_v3_mul);
+        node_detector_yolo_v3_mul->set_common_node_parameters(NodeParams{ "detector_yolo_v3_mul", target });
+        _graph.add_connection(id_detector_yolo_v3_add, 0, id_detector_yolo_v3_mul, 0);
+        _graph.add_connection(id_detector_yolo_v3_mul_y, 0, id_detector_yolo_v3_mul, 1);
+
+        NodeID id_detector_yolo_v3_Sigmoid_1 = _graph.add_node<ActivationLayerNode>(
+                                                   ActivationLayerInfo{ ActivationLayerInfo::ActivationFunction::LOGISTIC, 0, 0 });
+        INode *node_detector_yolo_v3_Sigmoid_1 = _graph.node(id_detector_yolo_v3_Sigmoid_1);
+        node_detector_yolo_v3_Sigmoid_1->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Sigmoid_1", target });
+        _graph.add_connection(id_detector_yolo_v3_split, 2, id_detector_yolo_v3_Sigmoid_1, 0);
+
+        NodeID id_detector_yolo_v3_Exp = _graph.add_node<UnaryEltwiseLayerNode>(
+                                             descriptors::UnaryEltwiseLayerDescriptor
+        {
+            UnaryEltwiseOperation::Exp,
+            QuantizationInfo() });
+        INode *node_detector_yolo_v3_Exp = _graph.node(id_detector_yolo_v3_Exp);
+        node_detector_yolo_v3_Exp->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Exp", target });
+        _graph.add_connection(id_detector_yolo_v3_split, 1, id_detector_yolo_v3_Exp, 0);
+
+        NodeID id_detector_yolo_v3_mul_2 = _graph.add_node<EltwiseLayerNode>(
+                                               descriptors::EltwiseLayerDescriptor
+        {
+            EltwiseOperation::Mul,
+            QuantizationInfo() });
+        INode *node_detector_yolo_v3_mul_2 = _graph.node(id_detector_yolo_v3_mul_2);
+        node_detector_yolo_v3_mul_2->set_common_node_parameters(NodeParams{ "detector_yolo_v3_mul_2", target });
+        _graph.add_connection(id_detector_yolo_v3_Exp, 0, id_detector_yolo_v3_mul_2, 0);
+        _graph.add_connection(id_detector_yolo_v3_mul_1, 0, id_detector_yolo_v3_mul_2, 1);
+
+        NodeID id_detector_yolo_v3_Sigmoid_2 = _graph.add_node<ActivationLayerNode>(
+                                                   ActivationLayerInfo{ ActivationLayerInfo::ActivationFunction::LOGISTIC, 0, 0 });
+        INode *node_detector_yolo_v3_Sigmoid_2 = _graph.node(id_detector_yolo_v3_Sigmoid_2);
+        node_detector_yolo_v3_Sigmoid_2->set_common_node_parameters(NodeParams{ "detector_yolo_v3_Sigmoid_2", target });
+        _graph.add_connection(id_detector_yolo_v3_split, 3, id_detector_yolo_v3_Sigmoid_2, 0);
+
+        NodeID id_detector_yolo_v3_concat_2 = _graph.add_node<ConcatenateLayerNode>(
+                                                  4,
+                                                  descriptors::ConcatLayerDescriptor{ x_dim });
+        INode *node_detector_yolo_v3_concat_2 = _graph.node(id_detector_yolo_v3_concat_2);
+        node_detector_yolo_v3_concat_2->set_common_node_parameters(NodeParams{ "detector_yolo_v3_concat_2", target });
+        _graph.add_connection(id_detector_yolo_v3_mul, 0, id_detector_yolo_v3_concat_2, 0);
+        _graph.add_connection(id_detector_yolo_v3_mul_2, 0, id_detector_yolo_v3_concat_2, 1);
+        _graph.add_connection(id_detector_yolo_v3_Sigmoid_1, 0, id_detector_yolo_v3_concat_2, 2);
+        _graph.add_connection(id_detector_yolo_v3_Sigmoid_2, 0, id_detector_yolo_v3_concat_2, 3);
+
+        NodeID id_detector_yolo_v3_concat_9 = _graph.add_node<ConcatenateLayerNode>(
+                                                  3,
+                                                  descriptors::ConcatLayerDescriptor{ y_dim });
+        INode *node_detector_yolo_v3_concat_9 = _graph.node(id_detector_yolo_v3_concat_9);
+        node_detector_yolo_v3_concat_9->set_common_node_parameters(NodeParams{ "detector_yolo_v3_concat_9", target });
+        _graph.add_connection(id_detector_yolo_v3_concat_2, 0, id_detector_yolo_v3_concat_9, 0);
+        _graph.add_connection(id_detector_yolo_v3_concat_5, 0, id_detector_yolo_v3_concat_9, 1);
+        _graph.add_connection(id_detector_yolo_v3_concat_8, 0, id_detector_yolo_v3_concat_9, 2);
+
+        NodeID id_split = _graph.add_node<SplitLayerNode>(
+                              5,
+                              0,
+                              std::vector<int> { 1, 1, 1, 1, -1 });
+        INode *node_split = _graph.node(id_split);
+        node_split->set_common_node_parameters(NodeParams{ "split", target });
+        _graph.add_connection(id_detector_yolo_v3_concat_9, 0, id_split, 0);
+
+        NodeID id_truediv = _graph.add_node<EltwiseLayerNode>(
+                                descriptors::EltwiseLayerDescriptor
+        {
+            EltwiseOperation::Mul,
+            QuantizationInfo() });
+        INode *node_truediv = _graph.node(id_truediv);
+        node_truediv->set_common_node_parameters(NodeParams{ "truediv", target });
+        _graph.add_connection(id_split, 2, id_truediv, 0);
+        _graph.add_connection(id_ConstantFolding_truediv_recip, 0, id_truediv, 1);
+
+        NodeID id_sub = _graph.add_node<EltwiseLayerNode>(
+                            descriptors::EltwiseLayerDescriptor
+        {
+            EltwiseOperation::Sub,
+            QuantizationInfo() });
+        INode *node_sub = _graph.node(id_sub);
+        node_sub->set_common_node_parameters(NodeParams{ "sub", target });
+        _graph.add_connection(id_split, 0, id_sub, 0);
+        _graph.add_connection(id_truediv, 0, id_sub, 1);
+
+        NodeID id_add = _graph.add_node<EltwiseLayerNode>(
+                            descriptors::EltwiseLayerDescriptor
+        {
+            EltwiseOperation::Add,
+            QuantizationInfo() });
+        INode *node_add = _graph.node(id_add);
+        node_add->set_common_node_parameters(NodeParams{ "add", target });
+        _graph.add_connection(id_split, 0, id_add, 0);
+        _graph.add_connection(id_truediv, 0, id_add, 1);
+
+        NodeID id_truediv_1 = _graph.add_node<EltwiseLayerNode>(
+                                  descriptors::EltwiseLayerDescriptor
+        {
+            EltwiseOperation::Mul,
+            QuantizationInfo() });
+        INode *node_truediv_1 = _graph.node(id_truediv_1);
+        node_truediv_1->set_common_node_parameters(NodeParams{ "truediv_1", target });
+        _graph.add_connection(id_split, 3, id_truediv_1, 0);
+        _graph.add_connection(id_ConstantFolding_truediv_1_recip, 0, id_truediv_1, 1);
+
+        NodeID id_sub_1 = _graph.add_node<EltwiseLayerNode>(
+                              descriptors::EltwiseLayerDescriptor
+        {
+            EltwiseOperation::Sub,
+            QuantizationInfo() });
+        INode *node_sub_1 = _graph.node(id_sub_1);
+        node_sub_1->set_common_node_parameters(NodeParams{ "sub_1", target });
+        _graph.add_connection(id_split, 1, id_sub_1, 0);
+        _graph.add_connection(id_truediv_1, 0, id_sub_1, 1);
+
+        NodeID id_add_1 = _graph.add_node<EltwiseLayerNode>(
+                              descriptors::EltwiseLayerDescriptor
+        {
+            EltwiseOperation::Add,
+            QuantizationInfo() });
+        INode *node_add_1 = _graph.node(id_add_1);
+        node_add_1->set_common_node_parameters(NodeParams{ "add_1", target });
+        _graph.add_connection(id_split, 1, id_add_1, 0);
+        _graph.add_connection(id_truediv_1, 0, id_add_1, 1);
+
+        NodeID id_output_boxes = _graph.add_node<ConcatenateLayerNode>(
+                                     5,
+                                     descriptors::ConcatLayerDescriptor{ x_dim });
+        INode *node_output_boxes = _graph.node(id_output_boxes);
+        node_output_boxes->set_common_node_parameters(NodeParams{ "output_boxes", target });
+        _graph.add_connection(id_sub, 0, id_output_boxes, 0);
+        _graph.add_connection(id_sub_1, 0, id_output_boxes, 1);
+        _graph.add_connection(id_add, 0, id_output_boxes, 2);
+        _graph.add_connection(id_add_1, 0, id_output_boxes, 3);
+        _graph.add_connection(id_split, 4, id_output_boxes, 4);
+
+        NodeID id_output_140640247016360   = _graph.add_node<OutputNode>();
+        INode *node_output_140640247016360 = _graph.node(id_output_140640247016360);
+        node_output_140640247016360->set_common_node_parameters(NodeParams{ "output_140640247016360", target });
+        _graph.add_connection(id_output_boxes, 0, id_output_140640247016360, 0);
+        node_output_140640247016360->input(0)->set_accessor(get_npy_output_accessor(expected_output_filename.value(), TensorShape(85U, 10647U), DataType::F32, data_layout));
+
+        return true;
+    }
+
+    Graph &graph()
+    {
+        return _graph;
+    }
+
+private:
+    Graph _graph;
+};
+class GraphYoloV3OutputDetectorExample : public Example
+{
+public:
+    GraphYoloV3OutputDetectorExample()
+        : cmd_parser(), common_opts(cmd_parser), common_params()
+    {
+        expected_output_filename = cmd_parser.add_option<SimpleOption<std::string>>("expected-output-filename", "");
+        expected_output_filename->set_help("Name of npy file containing the expected output to validate the graph output.");
+    }
+    GraphYoloV3OutputDetectorExample(const GraphYoloV3OutputDetectorExample &) = delete;
+    GraphYoloV3OutputDetectorExample &operator=(const GraphYoloV3OutputDetectorExample &) = delete;
+
+    bool do_setup(int argc, char **argv) override
+    {
+        // Parse arguments
+        cmd_parser.parse(argc, argv);
+        cmd_parser.validate();
+
+        // Consume common parameters
+        common_params = consume_common_graph_parameters(common_opts);
+
+        // Return when help menu is requested
+        if(common_params.help)
+        {
+            cmd_parser.print_help(argv[0]);
+            return false;
+        }
+
+        // Print parameter values
+        std::cout << common_params << std::endl;
+
+        model.setup(common_params, *expected_output_filename);
+
+        GraphConfig config;
+        config.num_threads = common_params.threads;
+        config.use_tuner   = common_params.enable_tuner;
+        config.tuner_mode  = common_params.tuner_mode;
+        config.tuner_file  = common_params.tuner_file;
+
+        context.set_config(config);
+
+        auto pass_manager = create_default_pass_manager(common_params.target, config);
+        manager.finalize_graph(model.graph(), context, pass_manager, common_params.target);
+
+        return true;
+    }
+
+    void do_run() override
+    {
+        manager.execute_graph(model.graph());
+    }
+
+private:
+    CommandLineParser  cmd_parser;
+    CommonGraphOptions common_opts;
+    CommonGraphParams  common_params;
+
+    GraphContext context{};
+    GraphManager manager{};
+
+    GraphYoloV3OutputDetector model{};
+
+    SimpleOption<std::string> *expected_output_filename{ nullptr };
+};
+
+int main(int argc, char **argv)
+{
+    return run_example<GraphYoloV3OutputDetectorExample>(argc, argv);
+}
diff --git a/examples/neon_cartoon_effect.cpp b/examples/neon_cartoon_effect.cpp
index 4285aa4..dd33885 100644
--- a/examples/neon_cartoon_effect.cpp
+++ b/examples/neon_cartoon_effect.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/neon_cnn.cpp b/examples/neon_cnn.cpp
index ee6f46d..85f8792 100644
--- a/examples/neon_cnn.cpp
+++ b/examples/neon_cnn.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/neon_convolution.cpp b/examples/neon_convolution.cpp
index 56b4ddc..0b33c76 100644
--- a/examples/neon_convolution.cpp
+++ b/examples/neon_convolution.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/neon_copy_objects.cpp b/examples/neon_copy_objects.cpp
index 84a2abd..2fbc128 100644
--- a/examples/neon_copy_objects.cpp
+++ b/examples/neon_copy_objects.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/neon_gemm_qasymm8.cpp b/examples/neon_gemm_qasymm8.cpp
index f028e00..efe1655 100644
--- a/examples/neon_gemm_qasymm8.cpp
+++ b/examples/neon_gemm_qasymm8.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/neon_opticalflow.cpp b/examples/neon_opticalflow.cpp
index b5df819..ff9478c 100644
--- a/examples/neon_opticalflow.cpp
+++ b/examples/neon_opticalflow.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/neon_permute.cpp b/examples/neon_permute.cpp
index 05c8169..49848de 100644
--- a/examples/neon_permute.cpp
+++ b/examples/neon_permute.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/neon_scale.cpp b/examples/neon_scale.cpp
index b04d916..ac9d062 100644
--- a/examples/neon_scale.cpp
+++ b/examples/neon_scale.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -60,7 +60,7 @@
         dst.allocator()->init(dst_tensor_info);
 
         // Configure Scale function object:
-        scale.configure(&src, &dst, InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED);
+        scale.configure(&src, &dst, ScaleKernelInfo{ InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED });
 
         // Allocate all the images
         src.allocator()->allocate();
diff --git a/examples/neon_sgemm.cpp b/examples/neon_sgemm.cpp
index 8f395de..07696bd 100644
--- a/examples/neon_sgemm.cpp
+++ b/examples/neon_sgemm.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/examples/neoncl_scale_median_gaussian.cpp b/examples/neoncl_scale_median_gaussian.cpp
index 1b26517..df0eb96 100644
--- a/examples/neoncl_scale_median_gaussian.cpp
+++ b/examples/neoncl_scale_median_gaussian.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -71,7 +71,7 @@
         median_gauss.allocator()->init(scale_median_info);
         dst.allocator()->init(scale_median_info);
 
-        scale.configure(&src, &scale_median, InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::REPLICATE);
+        scale.configure(&src, &scale_median, ScaleKernelInfo{ InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::REPLICATE });
         median.configure(&scale_median, &median_gauss, BorderMode::REPLICATE);
         gauss.configure(&median_gauss, &dst, BorderMode::REPLICATE);
 
diff --git a/scripts/clang_tidy_rules.py b/scripts/clang_tidy_rules.py
index a030f21..5e13aa0 100755
--- a/scripts/clang_tidy_rules.py
+++ b/scripts/clang_tidy_rules.py
@@ -8,6 +8,8 @@
            "arm_compute/core/NEON/kernels/convolution/common " \
            "arm_compute/core/NEON/kernels/convolution/depthwise " \
            "arm_compute/core/NEON/kernels/convolution/winograd " \
+           "src/core/NEON/kernels/assembly " \
+           "src/core/NEON/kernels/convolution/winograd " \
            "include/linux include " \
            ". " \
            "3rdparty/include kernels".split()
diff --git a/scripts/copyright_eula.txt b/scripts/copyright_eula.txt
index 50e8090..16cd734 100644
--- a/scripts/copyright_eula.txt
+++ b/scripts/copyright_eula.txt
@@ -1,4 +1,4 @@
-Copyright (c) 2016, 2017 ARM Limited.
+Copyright (c) 2016, 2017 Arm Limited.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to
diff --git a/scripts/copyright_mit.txt b/scripts/copyright_mit.txt
index a43ef66..049d444 100644
--- a/scripts/copyright_mit.txt
+++ b/scripts/copyright_mit.txt
@@ -1,4 +1,4 @@
-Copyright (c) 2018 ARM Limited.
+Copyright (c) 2018 Arm Limited.
 
 SPDX-License-Identifier: MIT
 
diff --git a/scripts/enable_tracing.py b/scripts/enable_tracing.py
index 3379e53..842c58e 100755
--- a/scripts/enable_tracing.py
+++ b/scripts/enable_tracing.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2020 ARM Limited.
+# Copyright (c) 2020 Arm Limited.
 #
 # SPDX-License-Identifier: MIT
 #
diff --git a/src/core/AccessWindowAutoPadding.cpp b/src/core/AccessWindowAutoPadding.cpp
index cfb36e1..85c5b27 100644
--- a/src/core/AccessWindowAutoPadding.cpp
+++ b/src/core/AccessWindowAutoPadding.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/AccessWindowStatic.cpp b/src/core/AccessWindowStatic.cpp
index a3a0f28..10e88b8 100644
--- a/src/core/AccessWindowStatic.cpp
+++ b/src/core/AccessWindowStatic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/AccessWindowTranspose.cpp b/src/core/AccessWindowTranspose.cpp
index 70235a2..4c03ca1 100644
--- a/src/core/AccessWindowTranspose.cpp
+++ b/src/core/AccessWindowTranspose.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/CLCompileContext.cpp b/src/core/CL/CLCompileContext.cpp
index 32a407d..0afb7e5 100644
--- a/src/core/CL/CLCompileContext.cpp
+++ b/src/core/CL/CLCompileContext.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/CLCoreRuntimeContext.cpp b/src/core/CL/CLCoreRuntimeContext.cpp
index f9efad2..6b1b1f5 100644
--- a/src/core/CL/CLCoreRuntimeContext.cpp
+++ b/src/core/CL/CLCoreRuntimeContext.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 15f45d5..895bb72 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -218,11 +218,6 @@
     }
 }
 
-std::string get_underlying_cl_type_from_data_type(const DataType &dt)
-{
-    return get_cl_type_from_data_type(dt);
-}
-
 GPUTarget get_target_from_device(const cl::Device &device)
 {
     // Query device name size
@@ -370,6 +365,27 @@
     return true;
 }
 
+bool image2d_from_buffer_supported(const cl::Device &device)
+{
+    return device_supports_extension(device, "cl_khr_image2d_from_buffer");
+}
+
+size_t get_cl_image_pitch_alignment(const cl::Device &device)
+{
+    cl_uint pixel_aligment = 0;
+
+    cl_int err = clGetDeviceInfo(device(), CL_DEVICE_IMAGE_PITCH_ALIGNMENT, sizeof(cl_uint), &pixel_aligment, nullptr);
+
+    if(err == CL_SUCCESS)
+    {
+        return pixel_aligment;
+    }
+    else
+    {
+        return 0;
+    }
+}
+
 cl::Kernel create_opencl_kernel(CLCoreRuntimeContext *ctx, const std::string &kernel_name, const CLBuildOptions &build_opts)
 {
     if(ctx && ctx->kernel_library())
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index d4073c6..0b59ec8 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -203,7 +203,6 @@
     { "gather", "gather.cl" },
     { "gaussian1x5_sub_x", "gaussian_pyramid.cl" },
     { "gaussian5x1_sub_y", "gaussian_pyramid.cl" },
-    { "gemm_accumulate_biases", "gemm.cl" },
     { "gemm_ma_f16", "gemm.cl" },
     { "gemm_ma_f32", "gemm.cl" },
     { "gemm_mv", "gemv.cl" },
@@ -220,9 +219,13 @@
     { "gemm_mm_floating_point_f32_bifrost_1000", "gemm.cl" },
     { "gemm_mm_native", "gemm.cl" },
     { "gemm_mm_reshaped_lhs_nt_rhs_t", "gemm.cl" },
+    { "gemm_mm_reshaped_lhs_nt_rhs_t_texture", "gemm.cl" },
     { "gemm_mm_reshaped_lhs_t_rhs_nt", "gemm.cl" },
+    { "gemm_mm_reshaped_lhs_t_rhs_nt_texture", "gemm.cl" },
     { "gemm_mm_reshaped_only_rhs_nt", "gemm.cl" },
+    { "gemm_mm_reshaped_only_rhs_nt_texture", "gemm.cl" },
     { "gemm_mm_reshaped_only_rhs_t", "gemm.cl" },
+    { "gemm_mm_reshaped_only_rhs_t_texture", "gemm.cl" },
     { "gemm_lc_vm_f32", "gemm.cl" },
     { "gemm_reshape_lhs_matrix_nt", "gemm.cl" },
     { "gemm_reshape_lhs_matrix_t", "gemm.cl" },
@@ -280,6 +283,7 @@
     { "lktracker_stage0", "optical_flow_pyramid_lk.cl" },
     { "lktracker_stage1", "optical_flow_pyramid_lk.cl" },
     { "magnitude_phase", "magnitude_phase.cl" },
+    { "max_unpooling_layer_2", "unpooling_layer.cl" },
     { "mean_stddev_accumulate", "mean_stddev.cl" },
     { "mean_stddev_normalization", "mean_stddev_normalization.cl" },
     { "memset", "memset.cl" },
@@ -322,6 +326,10 @@
     { "pooling_layer_7", "pooling_layer.cl" },
     { "pooling_layer_MxN_nchw", "pooling_layer.cl" },
     { "pooling_layer_MxN_nhwc", "pooling_layer.cl" },
+    { "pooling_layer_2_nhwc_indices_fp32", "pooling_layer.cl" },
+    { "pooling_layer_2_nhwc_indices_fp16", "pooling_layer.cl" },
+    { "pooling_layer_2_nchw_indices_fp32", "pooling_layer.cl" },
+    { "pooling_layer_2_nchw_indices_fp16", "pooling_layer.cl" },
     { "pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl" },
     { "pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl" },
     { "prior_box_layer_nchw", "prior_box_layer.cl" },
@@ -932,6 +940,10 @@
 #include "./cl_kernels/types.hembed"
     },
     {
+        "unpooling_layer.cl",
+#include "./cl_kernels/unpooling_layer.clembed"
+    },
+    {
         "warp_affine.cl",
 #include "./cl_kernels/warp_affine.clembed"
     },
diff --git a/src/core/CL/CLTracePoint.cpp b/src/core/CL/CLTracePoint.cpp
index b459cfb..631cb84 100644
--- a/src/core/CL/CLTracePoint.cpp
+++ b/src/core/CL/CLTracePoint.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/CLUtils.cpp b/src/core/CL/CLUtils.cpp
new file mode 100644
index 0000000..5d0cdf7
--- /dev/null
+++ b/src/core/CL/CLUtils.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Types.h"
+
+#include "src/core/CL/CLUtils.h"
+
+cl::Image2D arm_compute::create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, cl_channel_type data_type, size_t image_row_pitch)
+{
+    cl_mem cl_image;
+    cl_int err = CL_SUCCESS;
+
+    const cl_image_format format = { CL_RGBA, data_type };
+
+    cl_image_desc desc;
+    memset(&desc, 0, sizeof(desc));
+    desc.image_type      = CL_MEM_OBJECT_IMAGE2D;
+    desc.mem_object      = buffer();
+    desc.image_row_pitch = image_row_pitch;
+    desc.image_width     = shape2d[0];
+    desc.image_height    = shape2d[1];
+
+    cl_image = clCreateImage(ctx(), CL_MEM_READ_ONLY, &format, &desc, nullptr, &err);
+
+    ARM_COMPUTE_UNUSED(err);
+    ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Error during the creation of CL image from buffer");
+
+    return cl::Image2D(cl_image);
+}
diff --git a/src/core/CL/CLUtils.h b/src/core/CL/CLUtils.h
new file mode 100644
index 0000000..8f1c58b
--- /dev/null
+++ b/src/core/CL/CLUtils.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_CL_CLUTILS_H
+#define ARM_COMPUTE_CL_CLUTILS_H
+
+#include "arm_compute/core/CL/OpenCL.h"
+
+namespace arm_compute
+{
+class TensorShape;
+
+/** Create a cl::Image2D object from an OpenCL buffer
+ *
+ * @note The following conditions are required to create a OpenCL image object from OpenCL buffer,
+ *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+ *       -# The stride Y for the input1 should satisfy the OpenCL pitch alignment requirement
+ *       -# input width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+ *       -# input height should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+ *
+ * It is user responsibility to ensure the above conditions are satisfied since no checks are performed within this function
+ *
+ * @param[in] ctx             cl::Context object
+ * @param[in] buffer          cl::Buffer object from which the OpenCL image2d object is created
+ * @param[in] shape2d         2D tensor shape
+ * @param[in] data_type       cl_channel_type to use. Only supported CL_FLOAT
+ * @param[in] image_row_pitch Image row pitch (a.k.a. stride Y) to be used in the image2d object
+ *
+ * @return cl::Image2D object
+ */
+cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, cl_channel_type data_type, size_t image_row_pitch);
+
+} // arm_compute
+
+#endif /* ARM_COMPUTE_CL_CLUTILS_H */
diff --git a/src/core/CL/ICLDistribution1D.cpp b/src/core/CL/ICLDistribution1D.cpp
index a645d0e..d185f13 100644
--- a/src/core/CL/ICLDistribution1D.cpp
+++ b/src/core/CL/ICLDistribution1D.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/ICLHOG.cpp b/src/core/CL/ICLHOG.cpp
index e182997..aaabe86 100644
--- a/src/core/CL/ICLHOG.cpp
+++ b/src/core/CL/ICLHOG.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index ea9c62a..be63374 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/ICLLut.cpp b/src/core/CL/ICLLut.cpp
index ea9deac..007a524 100644
--- a/src/core/CL/ICLLut.cpp
+++ b/src/core/CL/ICLLut.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/ICLMultiHOG.cpp b/src/core/CL/ICLMultiHOG.cpp
index 8ece566..73bee39 100644
--- a/src/core/CL/ICLMultiHOG.cpp
+++ b/src/core/CL/ICLMultiHOG.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/ICLMultiImage.cpp b/src/core/CL/ICLMultiImage.cpp
index dbf3fe3..01b05fc 100644
--- a/src/core/CL/ICLMultiImage.cpp
+++ b/src/core/CL/ICLMultiImage.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/ICLSimple2DKernel.cpp b/src/core/CL/ICLSimple2DKernel.cpp
index cf6c9c8..ce95495 100644
--- a/src/core/CL/ICLSimple2DKernel.cpp
+++ b/src/core/CL/ICLSimple2DKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/ICLSimple3DKernel.cpp b/src/core/CL/ICLSimple3DKernel.cpp
index 4197307..3d08262 100644
--- a/src/core/CL/ICLSimple3DKernel.cpp
+++ b/src/core/CL/ICLSimple3DKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/ICLSimpleKernel.cpp b/src/core/CL/ICLSimpleKernel.cpp
index 48e5a88..d2f09a3 100644
--- a/src/core/CL/ICLSimpleKernel.cpp
+++ b/src/core/CL/ICLSimpleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/ICLTensor.cpp b/src/core/CL/ICLTensor.cpp
index 4a7952e..b541bff 100644
--- a/src/core/CL/ICLTensor.cpp
+++ b/src/core/CL/ICLTensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
index 9a3e344..6c70861 100644
--- a/src/core/CL/OpenCL.cpp
+++ b/src/core/CL/OpenCL.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -133,6 +133,7 @@
     LOAD_FUNCTION_PTR(clEnqueueSVMUnmap, handle);
     LOAD_FUNCTION_PTR(clEnqueueMarker, handle);
     LOAD_FUNCTION_PTR(clWaitForEvents, handle);
+    LOAD_FUNCTION_PTR(clCreateImage, handle);
 
     // Third-party extensions
     LOAD_FUNCTION_PTR(clImportMemoryARM, handle);
@@ -938,6 +939,30 @@
 }
 
 cl_mem
+clCreateImage(cl_context             context,
+              cl_mem_flags           flags,
+              const cl_image_format *image_format,
+              const cl_image_desc   *image_desc,
+              void                  *host_ptr,
+              cl_int                *errcode_ret)
+{
+    arm_compute::CLSymbols::get().load_default();
+    auto func = arm_compute::CLSymbols::get().clCreateImage_ptr;
+    if(func != nullptr)
+    {
+        return func(context, flags, image_format, image_desc, host_ptr, errcode_ret);
+    }
+    else
+    {
+        if(errcode_ret != nullptr)
+        {
+            *errcode_ret = CL_OUT_OF_RESOURCES;
+        }
+        return nullptr;
+    }
+}
+
+cl_mem
 clImportMemoryARM(cl_context                      context,
                   cl_mem_flags                    flags,
                   const cl_import_properties_arm *properties,
diff --git a/src/core/CL/cl_kernels/absdiff.cl b/src/core/CL/cl_kernels/absdiff.cl
index 1761342..a09caf5 100644
--- a/src/core/CL/cl_kernels/absdiff.cl
+++ b/src/core/CL/cl_kernels/absdiff.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/accumulate.cl b/src/core/CL/cl_kernels/accumulate.cl
index 39c1512..9e37830 100644
--- a/src/core/CL/cl_kernels/accumulate.cl
+++ b/src/core/CL/cl_kernels/accumulate.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/activation_float_helpers.h b/src/core/CL/cl_kernels/activation_float_helpers.h
index a1e742d..bedde83 100644
--- a/src/core/CL/cl_kernels/activation_float_helpers.h
+++ b/src/core/CL/cl_kernels/activation_float_helpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/activation_layer.cl b/src/core/CL/cl_kernels/activation_layer.cl
index d820753..f846cb2 100644
--- a/src/core/CL/cl_kernels/activation_layer.cl
+++ b/src/core/CL/cl_kernels/activation_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/activation_layer_quant.cl b/src/core/CL/cl_kernels/activation_layer_quant.cl
index ebd3408..0481319 100644
--- a/src/core/CL/cl_kernels/activation_layer_quant.cl
+++ b/src/core/CL/cl_kernels/activation_layer_quant.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,7 +41,7 @@
  * @note Quantization offsets of the input/output tensors are passed in only if asymmetric with -DO1_VAL= and -DO2_VAL= respectively.
  * @note Quantized value of constant zero should be given as a preprocessor argument using -DCONST_0=value. e.g. -DCONST_0=128.
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8/QSYMM16
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM16
  * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
@@ -112,7 +112,7 @@
  * @note Quantization offsets of the input/output tensors are passed in with -DO1_VAL= and -DO2_VAL= respectively.
  * @note Quantized value of constant zero should be given as a preprocessor argument using -DCONST_0=value. e.g. -DCONST_0=128.
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8/QSYMM16
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM16
  * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/activation_quant_helpers.h b/src/core/CL/cl_kernels/activation_quant_helpers.h
index 7eaf082..a32e4e9 100644
--- a/src/core/CL/cl_kernels/activation_quant_helpers.h
+++ b/src/core/CL/cl_kernels/activation_quant_helpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/arg_min_max.cl b/src/core/CL/cl_kernels/arg_min_max.cl
index 104d30d..5184e0c 100644
--- a/src/core/CL/cl_kernels/arg_min_max.cl
+++ b/src/core/CL/cl_kernels/arg_min_max.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -193,7 +193,7 @@
  * @note The arg_max flag must be passed at compile time using -DARG_MAX if we want to compute the ArgMax
  * @note The arg_min flag must be passed at compile time using -DARG_MIN if we want to compute the ArgMin
  *
- * @param[in] src_ptr                                   Pointer to the source tensor. Supported data types: S32/F16/F32
+ * @param[in] src_ptr                                   Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
  * @param[in] src_stride_x                              Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                                src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                              Stride of the source tensor in Y dimension (in bytes)
@@ -262,7 +262,7 @@
         // Perform parallel reduction
         for(unsigned int i = middle; i > 0; i >>= 1)
         {
-            if( lid < i && lid + i < lsize)
+            if(lid < i && lid + i < lsize)
             {
                 DATA_TYPE tmp0 = *(src_in_row + local_results[lid]);
                 DATA_TYPE tmp1 = *(src_in_row + local_results[lid + i]);
@@ -297,7 +297,7 @@
  * @note The data type of the select results must be passed at compile time using -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
  * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
  *
- * @param[in] src_ptr                              Pointer to the source tensor. Supported data types: S32/F16/F32
+ * @param[in] src_ptr                              Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
  * @param[in] src_stride_x                         Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
@@ -345,7 +345,7 @@
  * @note The data type of the select results must be passed at compile time using -DDATA_TYPE_SELECT: e.g. -DDATA_TYPE_SELECT=int
  * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
  *
- * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: S32/F16/F32
+ * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
  * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
  * @param[in] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
@@ -398,7 +398,7 @@
  * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
 * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
  *
- * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: S32/F16/F32
+ * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
  * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
  * @param[in] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/batch_to_space.cl b/src/core/CL/cl_kernels/batch_to_space.cl
index b2ec8a7..8a71985 100644
--- a/src/core/CL/cl_kernels/batch_to_space.cl
+++ b/src/core/CL/cl_kernels/batch_to_space.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
index 918caff..ad27aa3 100644
--- a/src/core/CL/cl_kernels/batchnormalization_layer.cl
+++ b/src/core/CL/cl_kernels/batchnormalization_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -346,9 +346,9 @@
     int c0 = z % DIM2;
     int c1 = z / DIM2;
 #else // ! defined(DIM2)
-    int c0 = 0;
+    int c0                                                                                    = 0;
 #if defined(NHWC)
-    int c1 = x;
+    int c1                                                                                    = x;
 #else  // defined(NHWC)
     int c1 = z;
 #endif // defined(NHWC)
@@ -386,7 +386,7 @@
     // Compute bias
 #if !defined(DIM2) && defined(NHWC)
     if(z == 0 && y == 0)
-#else !defined(DIM2) && defined(NHWC)
+#else  // !defined(DIM2) && defined(NHWC)
     if(x == 0 && y == 0 && c0 == 0)
 #endif // !defined(DIM2) && defined(NHWC)
     {
diff --git a/src/core/CL/cl_kernels/bitwise_op.cl b/src/core/CL/cl_kernels/bitwise_op.cl
index 135bfa9..b88b3bc 100644
--- a/src/core/CL/cl_kernels/bitwise_op.cl
+++ b/src/core/CL/cl_kernels/bitwise_op.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/bounding_box_transform.cl b/src/core/CL/cl_kernels/bounding_box_transform.cl
index e6f470a..a9b0496 100644
--- a/src/core/CL/cl_kernels/bounding_box_transform.cl
+++ b/src/core/CL/cl_kernels/bounding_box_transform.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/bounding_box_transform_quantized.cl b/src/core/CL/cl_kernels/bounding_box_transform_quantized.cl
index bebad62..9e5cee5 100644
--- a/src/core/CL/cl_kernels/bounding_box_transform_quantized.cl
+++ b/src/core/CL/cl_kernels/bounding_box_transform_quantized.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/canny.cl b/src/core/CL/cl_kernels/canny.cl
index 9bfa2f4..bcff843 100644
--- a/src/core/CL/cl_kernels/canny.cl
+++ b/src/core/CL/cl_kernels/canny.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/channel_combine.cl b/src/core/CL/cl_kernels/channel_combine.cl
index 4207414..550d52e 100644
--- a/src/core/CL/cl_kernels/channel_combine.cl
+++ b/src/core/CL/cl_kernels/channel_combine.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/channel_extract.cl b/src/core/CL/cl_kernels/channel_extract.cl
index e99e9eb..b64f248 100644
--- a/src/core/CL/cl_kernels/channel_extract.cl
+++ b/src/core/CL/cl_kernels/channel_extract.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/channel_shuffle.cl b/src/core/CL/cl_kernels/channel_shuffle.cl
index 3ac67c5..9a87eb4 100644
--- a/src/core/CL/cl_kernels/channel_shuffle.cl
+++ b/src/core/CL/cl_kernels/channel_shuffle.cl
@@ -1,26 +1,26 @@
 /*
-* Copyright (c) 2018 ARM Limited.
-*
-* SPDX-License-Identifier: MIT
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to
-* deal in the Software without restriction, including without limitation the
-* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-* sell copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in all
-* copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
+* Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
 #include "helpers.h"
 
 #if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
@@ -47,7 +47,7 @@
  * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
  *       K is equal to num_channels / num_groups.
  *
- * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the first source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the first source tensor in Y dimension (in bytes)
@@ -110,7 +110,7 @@
  * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
  *       K is equal to num_channels / num_groups.
  *
- * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the first source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the first source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/col2im.cl b/src/core/CL/cl_kernels/col2im.cl
index fb6dcc5..59c2d8a 100644
--- a/src/core/CL/cl_kernels/col2im.cl
+++ b/src/core/CL/cl_kernels/col2im.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/color_convert.cl b/src/core/CL/cl_kernels/color_convert.cl
index 7a872b4..cbebc88 100644
--- a/src/core/CL/cl_kernels/color_convert.cl
+++ b/src/core/CL/cl_kernels/color_convert.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/comparisons.cl b/src/core/CL/cl_kernels/comparisons.cl
index a41b7e2..4088461 100644
--- a/src/core/CL/cl_kernels/comparisons.cl
+++ b/src/core/CL/cl_kernels/comparisons.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -51,7 +51,7 @@
  * @param[in]  in1_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in]  in2_ptr                           Pointer to the source tensor. Supported data types: U8/S16/F16/F32
+ * @param[in]  in2_ptr                           Pointer to the source tensor. Supported data types: same as @p in1_ptr
  * @param[in]  in2_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  in2_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -59,7 +59,7 @@
  * @param[in]  in2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8 (only if both inputs are U8), S16/F16/F32
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8
  * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
@@ -115,7 +115,7 @@
  * @param[in]  in2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: same as @p in1_ptr
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8
  * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl
index 3684eb5..4281e67 100644
--- a/src/core/CL/cl_kernels/concatenate.cl
+++ b/src/core/CL/cl_kernels/concatenate.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -73,7 +73,7 @@
  * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
  * @note First input tensor width should be given as a preprocessor argument using -DINPUT1_WIDTH=width. e.g. -DINPUT1_WIDTH=8
  *
- * @param[in]  src1_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
+ * @param[in]  src1_ptr                           Pointer to the source tensor. Supported data types: All.
  * @param[in]  src1_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -155,7 +155,7 @@
  * @note Second input tensor width should be given as a preprocessor argument using -DINPUT2_WIDTH=width. e.g. -DINPUT2_WIDTH=8
  * @note Third input tensor width should be given as a preprocessor argument using -DINPUT3_WIDTH=width. e.g. -DINPUT3_WIDTH=8
  *
- * @param[in]  src1_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
+ * @param[in]  src1_ptr                           Pointer to the source tensor. Supported data types: All
  * @param[in]  src1_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -388,7 +388,7 @@
  * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
  * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/convert_fc_weights.cl b/src/core/CL/cl_kernels/convert_fc_weights.cl
index db08737..a451c02 100644
--- a/src/core/CL/cl_kernels/convert_fc_weights.cl
+++ b/src/core/CL/cl_kernels/convert_fc_weights.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/convolution3x3.cl b/src/core/CL/cl_kernels/convolution3x3.cl
index 625c6c4..7bca567 100644
--- a/src/core/CL/cl_kernels/convolution3x3.cl
+++ b/src/core/CL/cl_kernels/convolution3x3.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/convolution5x5.cl b/src/core/CL/cl_kernels/convolution5x5.cl
index 2c3cafa..9995ebf 100644
--- a/src/core/CL/cl_kernels/convolution5x5.cl
+++ b/src/core/CL/cl_kernels/convolution5x5.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/convolution7x7.cl b/src/core/CL/cl_kernels/convolution7x7.cl
index 9dd6a88..50fb3d7 100644
--- a/src/core/CL/cl_kernels/convolution7x7.cl
+++ b/src/core/CL/cl_kernels/convolution7x7.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/convolution9x9.cl b/src/core/CL/cl_kernels/convolution9x9.cl
index 2a5f4a1..7e77c61 100644
--- a/src/core/CL/cl_kernels/convolution9x9.cl
+++ b/src/core/CL/cl_kernels/convolution9x9.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/convolution_layer.cl b/src/core/CL/cl_kernels/convolution_layer.cl
index 874b78e..cfd1f12 100644
--- a/src/core/CL/cl_kernels/convolution_layer.cl
+++ b/src/core/CL/cl_kernels/convolution_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/convolution_rectangle.cl b/src/core/CL/cl_kernels/convolution_rectangle.cl
index f5a109f..925a698 100644
--- a/src/core/CL/cl_kernels/convolution_rectangle.cl
+++ b/src/core/CL/cl_kernels/convolution_rectangle.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/copy_tensor.cl b/src/core/CL/cl_kernels/copy_tensor.cl
index f4366b8..0592e07 100644
--- a/src/core/CL/cl_kernels/copy_tensor.cl
+++ b/src/core/CL/cl_kernels/copy_tensor.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
  * -# -DDEPTH = The third dimension (depth) of the tensor (it is needed only if d == 3)
  * -# -DDATA_TYPE = Input and output datatypes.
  *
- * @param[in]  in_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  in_ptr                            Pointer to the source tensor. Supported data types: All
  * @param[in]  in_stride_x                       Stride of the source tensor in X dimension (in bytes)
  * @param[in]  in_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  in_stride_y                       Stride of the source tensor in Y dimension (in bytes)
@@ -80,7 +80,7 @@
 #if defined(DATA_TYPE)
 /** Performs a copy of input tensor to the output tensor.
  *
- * @param[in]  in_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  in_ptr                            Pointer to the source tensor. Supported data types: All
  * @param[in]  in_stride_x                       Stride of the source tensor in X dimension (in bytes)
  * @param[in]  in_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  in_stride_y                       Stride of the source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/crop_tensor.cl b/src/core/CL/cl_kernels/crop_tensor.cl
index 55f8544..62ae36a 100644
--- a/src/core/CL/cl_kernels/crop_tensor.cl
+++ b/src/core/CL/cl_kernels/crop_tensor.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,7 +27,7 @@
 
 /** Performs a copy of input tensor to the output tensor.
  *
- * @param[in]  in_ptr                            Pointer to the source tensor. Supported data types: U16/S16/F16/U32/S32/F32
+ * @param[in]  in_ptr                            Pointer to the source tensor. Supported data types: All
  * @param[in]  in_stride_x                       Stride of the source tensor in X dimension (in bytes)
  * @param[in]  in_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  in_stride_y                       Stride of the source tensor in Y dimension (in bytes)
@@ -35,7 +35,7 @@
  * @param[in]  in_stride_z                       Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  in_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source tensor
- * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: same as @p in_ptr
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: F32
  * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  out_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/deconvolution_layer.cl b/src/core/CL/cl_kernels/deconvolution_layer.cl
index cb1abd1..b1d5e61 100644
--- a/src/core/CL/cl_kernels/deconvolution_layer.cl
+++ b/src/core/CL/cl_kernels/deconvolution_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/depth_convert.cl b/src/core/CL/cl_kernels/depth_convert.cl
index 5e9a3a1..75e6829 100644
--- a/src/core/CL/cl_kernels/depth_convert.cl
+++ b/src/core/CL/cl_kernels/depth_convert.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/depth_to_space.cl b/src/core/CL/cl_kernels/depth_to_space.cl
index 5c2e8a1..d3231a5 100644
--- a/src/core/CL/cl_kernels/depth_to_space.cl
+++ b/src/core/CL/cl_kernels/depth_to_space.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/depthwise_convolution.cl b/src/core/CL/cl_kernels/depthwise_convolution.cl
index c050175..e1f6505 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -692,7 +692,7 @@
  * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=vec_size, e.g., -DVEC_SIZE=4
  * @attention Input's height and width should be 3
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
index 97015fe..d4bea4b 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -144,7 +144,7 @@
 
 /** This function computes the depthwise convolution quantized.
  *
- * @param[in] src_ptr                                          Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_ptr                                          Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
  * @param[in] src_stride_x                                     Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                                       src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                                     Stride of the source tensor in Y dimension (in bytes)
@@ -152,7 +152,7 @@
  * @param[in] src_stride_z                                     Stride of the source tensor in Z dimension (in bytes)
  * @param[in] src_step_z                                       src_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in] src_offset_first_element_in_bytes                The offset of the first element in the source tensor
- * @param[in] dst_ptr                                          Pointer to the destination tensor. Supported data types: QASYMM8
+ * @param[in] dst_ptr                                          Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in] dst_stride_x                                     Stride of the destination tensor in X dimension (in bytes)
  * @param[in] dst_step_x                                       dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] dst_stride_y                                     Stride of the destination tensor in Y dimension (in bytes)
@@ -160,7 +160,7 @@
  * @param[in] dst_stride_z                                     Stride of the destination tensor in Z dimension (in bytes)
  * @param[in] dst_step_z                                       dst_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes                The offset of the first element in the destination tensor
- * @param[in] weights_ptr                                      Pointer to the weights tensor. Supported data types: QASYMM8/QSYMM8_PER_CHANNEL
+ * @param[in] weights_ptr                                      Pointer to the weights tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
  * @param[in] weights_stride_x                                 Stride of the weights tensor in X dimension (in bytes)
  * @param[in] weights_step_x                                   weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] weights_stride_y                                 Stride of the weights tensor in Y dimension (in bytes)
@@ -461,9 +461,7 @@
 #endif /*DILATION_X==1*/
 /** This function computes the depthwise convolution quantized using dot product when the data layout is NCHW.
  *
- * @note Per-channel quantization is not supported by this kernel.
- *
- * @param[in] src_ptr                                          Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_ptr                                          Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
  * @param[in] src_stride_x                                     Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                                       src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                                     Stride of the source tensor in Y dimension (in bytes)
@@ -471,7 +469,7 @@
  * @param[in] src_stride_z                                     Stride of the source tensor in Z dimension (in bytes)
  * @param[in] src_step_z                                       src_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in] src_offset_first_element_in_bytes                The offset of the first element in the source tensor
- * @param[in] dst_ptr                                          Pointer to the destination tensor. Supported data types: QASYMM8
+ * @param[in] dst_ptr                                          Pointer to the destination tensor. Supported data types: same as @p src_ptr
  * @param[in] dst_stride_x                                     Stride of the destination tensor in X dimension (in bytes)
  * @param[in] dst_step_x                                       dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] dst_stride_y                                     Stride of the destination tensor in Y dimension (in bytes)
@@ -479,7 +477,7 @@
  * @param[in] dst_stride_z                                     Stride of the destination tensor in Z dimension (in bytes)
  * @param[in] dst_step_z                                       dst_stride_z * number of elements along Y processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes                The offset of the first element in the destination tensor
- * @param[in] weights_ptr                                      Pointer to the weights tensor. Supported data types: QASYMM8/QSYMM8_PER_CHANNEL
+ * @param[in] weights_ptr                                      Pointer to the weights tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
  * @param[in] weights_stride_x                                 Stride of the weights tensor in X dimension (in bytes)
  * @param[in] weights_step_x                                   weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] weights_stride_y                                 Stride of the weights tensor in Y dimension (in bytes)
@@ -789,7 +787,7 @@
  * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
  * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
  *
- * @param[in] src_ptr                                          Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_ptr                                          Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
  * @param[in] src_stride_x                                     Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                                       src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                                     Stride of the source tensor in Y dimension (in bytes)
@@ -809,7 +807,7 @@
  * @param[in] dst_stride_w                                     Stride of the destination tensor in W dimension (in bytes)
  * @param[in] dst_step_w                                       dst_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes                The offset of the first element in the destination tensor
- * @param[in] weights_ptr                                      Pointer to the weights tensor reshaped. Supported data types: QASYMM8/QSYMM8_PER_CHANNEL
+ * @param[in] weights_ptr                                      Pointer to the weights tensor reshaped. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
  * @param[in] weights_stride_x                                 Stride of the weights tensor in X dimension (in bytes)
  * @param[in] weights_step_x                                   weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] weights_stride_y                                 Stride of the weights tensor in Y dimension (in bytes)
@@ -1028,7 +1026,7 @@
  * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
  * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1).
  *
- * @param[in] src_ptr                                          Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_ptr                                          Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
  * @param[in] src_stride_x                                     Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                                       src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                                     Stride of the source tensor in Y dimension (in bytes)
@@ -1048,7 +1046,7 @@
  * @param[in] dst_stride_w                                     Stride of the destination tensor in W dimension (in bytes)
  * @param[in] dst_step_w                                       dst_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in] dst_offset_first_element_in_bytes                The offset of the first element in the destination tensor
- * @param[in] weights_ptr                                      Pointer to the weights tensor. Supported data types: QASYMM8/QSYMM8_PER_CHANNEL
+ * @param[in] weights_ptr                                      Pointer to the weights tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
  * @param[in] weights_stride_x                                 Stride of the weights tensor in X dimension (in bytes)
  * @param[in] weights_step_x                                   weights_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] weights_stride_y                                 Stride of the weights tensor in Y dimension (in bytes)
@@ -1378,7 +1376,7 @@
  * @note If REAL_MULTIPLIER is passed at compile time (i.e. -DREAL_MULTIPLIER=1.355f), the final quantization is performed using a floating point multiplication.
  *       If not, the quantization will be performed using a fixed point multiplication
  *
- * @param[in] src_ptr                                          Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] src_ptr                                          Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
  * @param[in] src_stride_x                                     Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                                       src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                                     Stride of the source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/dequantization_layer.cl b/src/core/CL/cl_kernels/dequantization_layer.cl
index add86e3..127f67d 100644
--- a/src/core/CL/cl_kernels/dequantization_layer.cl
+++ b/src/core/CL/cl_kernels/dequantization_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/derivative.cl b/src/core/CL/cl_kernels/derivative.cl
index cd2091e..dddbb4d 100644
--- a/src/core/CL/cl_kernels/derivative.cl
+++ b/src/core/CL/cl_kernels/derivative.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/dilate.cl b/src/core/CL/cl_kernels/dilate.cl
index c62c701..14362c1 100644
--- a/src/core/CL/cl_kernels/dilate.cl
+++ b/src/core/CL/cl_kernels/dilate.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/direct_convolution1x1.cl b/src/core/CL/cl_kernels/direct_convolution1x1.cl
index cceeb0f..d0eea5b 100644
--- a/src/core/CL/cl_kernels/direct_convolution1x1.cl
+++ b/src/core/CL/cl_kernels/direct_convolution1x1.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/direct_convolution3x3.cl b/src/core/CL/cl_kernels/direct_convolution3x3.cl
index 08d25f6..da7a1e7 100644
--- a/src/core/CL/cl_kernels/direct_convolution3x3.cl
+++ b/src/core/CL/cl_kernels/direct_convolution3x3.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/direct_convolution5x5.cl b/src/core/CL/cl_kernels/direct_convolution5x5.cl
index 5299409..e5c7a51 100644
--- a/src/core/CL/cl_kernels/direct_convolution5x5.cl
+++ b/src/core/CL/cl_kernels/direct_convolution5x5.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/direct_convolution9x9.cl b/src/core/CL/cl_kernels/direct_convolution9x9.cl
index 8d0417a..64da38d 100644
--- a/src/core/CL/cl_kernels/direct_convolution9x9.cl
+++ b/src/core/CL/cl_kernels/direct_convolution9x9.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,7 +25,7 @@
 
 #undef CONVERT_SAT
 
-#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(DATA_LAYOUT_NHWC)
+#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(DATA_LAYOUT_NHWC) && defined(PAD_TOP)
 
 #define PTR_TO_VALUE(PTR, DATA_TYPE) *((__global DATA_TYPE *)(PTR))
 
@@ -288,134 +288,52 @@
 
     weights_addr += id0 * weights_stride_w;
 
-#if(PAD_TOP == 1)
-    const int coordy = id2 - PAD_TOP;
-    for(volatile int d = 0; d < WEIGHTS_DEPTH; d += STEP_X)
+    const int coordy = (id2 * STRIDE_Y) - PAD_TOP;
+    if(coordy < 0)
     {
-        if(coordy < 0) // special case Z = -1 doesn't exists
+        // Skip first rows containing padding
+        for(volatile int d = 0; d < WEIGHTS_DEPTH; d += STEP_X)
         {
-            //skip first row and load the two next ones
-            CONVOLUTION1x9_NHWC(values, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 7 * (int)src_stride_z), (weights_addr + 7 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 8 * (int)src_stride_z), (weights_addr + 8 * (int)weights_stride_z));
+            const int start_z = -coordy;
+            for(int i = start_z; i < 9; ++i)
+            {
+                CONVOLUTION1x9_NHWC(values, (src_addr + i * (int)src_stride_z), (weights_addr + i * (int)weights_stride_z));
+            }
+            src_addr += STEP_X * sizeof(DATA_TYPE);
+            weights_addr += STEP_X * sizeof(DATA_TYPE);
         }
-        else if(coordy == (DST_HEIGHT - PAD_TOP - 1))
-        {
-            // special case when computing the last row of the output we must read the last three rows from the input buffer (including padding) but the
-            // Z axis has no padding at all.
-            CONVOLUTION1x9_NHWC(values, src_addr, weights_addr);
-            CONVOLUTION1x9_NHWC(values, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 7 * (int)src_stride_z), (weights_addr + 7 * (int)weights_stride_z));
-        }
-        else
-        {
-            CONVOLUTION1x9_NHWC(values, src_addr, weights_addr);
-            CONVOLUTION1x9_NHWC(values, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 7 * (int)src_stride_z), (weights_addr + 7 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 8 * (int)src_stride_z), (weights_addr + 8 * (int)weights_stride_z));
-        }
-        src_addr += STEP_X * sizeof(DATA_TYPE);
-        weights_addr += STEP_X * sizeof(DATA_TYPE);
     }
-#elif(PAD_TOP == 2) // PAD_TOP == 1
-    const int coordy = id2 * STRIDE_Y;
-    for(volatile int d = 0; d < WEIGHTS_DEPTH; d += STEP_X)
+    else if(coordy > (SRC_HEIGHT - 9))
     {
-        if(coordy == 0) // special case Z = -2 doesn't exists
+        for(volatile int d = 0; d < WEIGHTS_DEPTH; d += STEP_X)
         {
-            //skip first row and load the two next ones
-            CONVOLUTION1x9_NHWC(values, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 7 * (int)src_stride_z), (weights_addr + 7 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 8 * (int)src_stride_z), (weights_addr + 8 * (int)weights_stride_z));
+            // Avoid loading rows beyond the input height
+            const int end_z = SRC_HEIGHT - coordy;
+            for(int i = 0; i < end_z; ++i)
+            {
+                CONVOLUTION1x9_NHWC(values, (src_addr + i * (int)src_stride_z), (weights_addr + i * (int)weights_stride_z));
+            }
+            src_addr += STEP_X * sizeof(DATA_TYPE);
+            weights_addr += STEP_X * sizeof(DATA_TYPE);
         }
-        else if(coordy == 1) // special case Z = -1 doesn't exists
-        {
-            //skip first row and load the two next ones
-            CONVOLUTION1x9_NHWC(values, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 7 * (int)src_stride_z), (weights_addr + 7 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 8 * (int)src_stride_z), (weights_addr + 8 * (int)weights_stride_z));
-        }
-        else if(coordy == (SRC_HEIGHT - 5))
-        {
-            // special case when computing the last row of the output we must read the last three rows from the input buffer (including padding) but the
-            // Z axis has no padding at all.
-            CONVOLUTION1x9_NHWC(values, src_addr, weights_addr);
-            CONVOLUTION1x9_NHWC(values, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z));
-        }
-        else if(coordy == (SRC_HEIGHT - 6))
-        {
-            // special case when computing the last row of the output we must read the last three rows from the input buffer (including padding) but the
-            // Z axis has no padding at all.
-            CONVOLUTION1x9_NHWC(values, src_addr, weights_addr);
-            CONVOLUTION1x9_NHWC(values, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 7 * (int)src_stride_z), (weights_addr + 7 * (int)weights_stride_z));
-        }
-        else
-        {
-            CONVOLUTION1x9_NHWC(values, src_addr, weights_addr);
-            CONVOLUTION1x9_NHWC(values, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 7 * (int)src_stride_z), (weights_addr + 7 * (int)weights_stride_z));
-            CONVOLUTION1x9_NHWC(values, (src_addr + 8 * (int)src_stride_z), (weights_addr + 8 * (int)weights_stride_z));
-        }
-        src_addr += STEP_X * sizeof(DATA_TYPE);
-        weights_addr += STEP_X * sizeof(DATA_TYPE);
     }
-
-#else  // PAD_TOP == 1
-    for(volatile int d = 0; d < WEIGHTS_DEPTH; d += STEP_X)
+    else
     {
-        CONVOLUTION1x9_NHWC(values, src_addr, weights_addr);
-        CONVOLUTION1x9_NHWC(values, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
-        CONVOLUTION1x9_NHWC(values, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
-        CONVOLUTION1x9_NHWC(values, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
-        CONVOLUTION1x9_NHWC(values, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
-        CONVOLUTION1x9_NHWC(values, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z));
-        CONVOLUTION1x9_NHWC(values, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z));
-        CONVOLUTION1x9_NHWC(values, (src_addr + 7 * (int)src_stride_z), (weights_addr + 7 * (int)weights_stride_z));
-        CONVOLUTION1x9_NHWC(values, (src_addr + 8 * (int)src_stride_z), (weights_addr + 8 * (int)weights_stride_z));
-        src_addr += STEP_X * sizeof(DATA_TYPE);
-        weights_addr += STEP_X * sizeof(DATA_TYPE);
+        for(volatile int d = 0; d < WEIGHTS_DEPTH; d += STEP_X)
+        {
+            CONVOLUTION1x9_NHWC(values, src_addr, weights_addr);
+            CONVOLUTION1x9_NHWC(values, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 7 * (int)src_stride_z), (weights_addr + 7 * (int)weights_stride_z));
+            CONVOLUTION1x9_NHWC(values, (src_addr + 8 * (int)src_stride_z), (weights_addr + 8 * (int)weights_stride_z));
+            src_addr += STEP_X * sizeof(DATA_TYPE);
+            weights_addr += STEP_X * sizeof(DATA_TYPE);
+        }
     }
-#endif // PAD_TOP == 1
 
 #if defined(VEC_SIZE)
     REDUCE(values.s0, values0);
@@ -443,4 +361,4 @@
     *((__global DATA_TYPE *)(dst.ptr + 7 * dst_stride_y)) = values.s7;
 #undef STEP_X
 }
-#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(DATA_LAYOUT_NHWC)
+#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(DATA_LAYOUT_NHWC) && defined(PAD_TOP)
diff --git a/src/core/CL/cl_kernels/direct_convolution_quantized.cl b/src/core/CL/cl_kernels/direct_convolution_quantized.cl
index e48c26e..8237fe1 100644
--- a/src/core/CL/cl_kernels/direct_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/direct_convolution_quantized.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,7 +33,113 @@
 
 #if defined(DATA_LAYOUT_NHWC)
 
-#if KERNEL_SIZE == 5
+#if KERNEL_SIZE == 9
+
+#if STRIDE_X == 1
+#define CONVOLUTION1x9(acc, src_ptr, weights_ptr) CONVOLUTION1x9_STRIDE1(acc, src_ptr, weights_ptr)
+#elif STRIDE_X == 2
+#define CONVOLUTION1x9(acc, src_ptr, weights_ptr) CONVOLUTION1x9_STRIDE2(acc, src_ptr, weights_ptr)
+#else /* STRIDE_X not equals 1 or 2 */
+#error "STRIDE_X larger than 2 is not supported"
+#endif /* STRIDE_X */
+
+#define CONVOLUTION1x9_STRIDE1(acc, src_ptr, weights_ptr)                          \
+    ({                                                                             \
+        int8 weights_values0 = 0;                                                  \
+        int  weights_value1  = 0;                                                  \
+        weights_values0.s0   = convert_int(*(weights_ptr + 0 * weights_stride_y)); \
+        weights_values0.s1   = convert_int(*(weights_ptr + 1 * weights_stride_y)); \
+        weights_values0.s2   = convert_int(*(weights_ptr + 2 * weights_stride_y)); \
+        weights_values0.s3   = convert_int(*(weights_ptr + 3 * weights_stride_y)); \
+        weights_values0.s4   = convert_int(*(weights_ptr + 4 * weights_stride_y)); \
+        weights_values0.s5   = convert_int(*(weights_ptr + 5 * weights_stride_y)); \
+        weights_values0.s6   = convert_int(*(weights_ptr + 6 * weights_stride_y)); \
+        weights_values0.s7   = convert_int(*(weights_ptr + 7 * weights_stride_y)); \
+        weights_value1       = convert_int(*(weights_ptr + 8 * weights_stride_y)); \
+        \
+        int8 src0 = 0;                                                             \
+        int8 src1 = 0;                                                             \
+        src0.s0   = convert_int(*(src_ptr + 0 * weights_stride_y));                \
+        src0.s1   = convert_int(*(src_ptr + 1 * weights_stride_y));                \
+        src0.s2   = convert_int(*(src_ptr + 2 * weights_stride_y));                \
+        src0.s3   = convert_int(*(src_ptr + 3 * weights_stride_y));                \
+        src0.s4   = convert_int(*(src_ptr + 4 * weights_stride_y));                \
+        src0.s5   = convert_int(*(src_ptr + 5 * weights_stride_y));                \
+        src0.s6   = convert_int(*(src_ptr + 6 * weights_stride_y));                \
+        src0.s7   = convert_int(*(src_ptr + 7 * weights_stride_y));                \
+        src1.s0   = convert_int(*(src_ptr + 8 * weights_stride_y));                \
+        src1.s1   = convert_int(*(src_ptr + 9 * weights_stride_y));                \
+        src1.s2   = convert_int(*(src_ptr + 10 * weights_stride_y));               \
+        src1.s3   = convert_int(*(src_ptr + 11 * weights_stride_y));               \
+        src1.s4   = convert_int(*(src_ptr + 12 * weights_stride_y));               \
+        src1.s5   = convert_int(*(src_ptr + 13 * weights_stride_y));               \
+        src1.s6   = convert_int(*(src_ptr + 14 * weights_stride_y));               \
+        src1.s7   = convert_int(*(src_ptr + 15 * weights_stride_y));               \
+        \
+        acc += src0 * (int8)weights_values0.s0;                                    \
+        acc += (int8)(src0.s1234, src0.s567, src1.s0) * (int8)weights_values0.s1;  \
+        acc += (int8)(src0.s234, src0.s567, src1.s01) * (int8)weights_values0.s2;  \
+        acc += (int8)(src0.s345, src0.s67, src1.s012) * (int8)weights_values0.s3;  \
+        acc += (int8)(src0.s4567, src1.s0123) * (int8)weights_values0.s4;          \
+        acc += (int8)(src0.s567, src1.s0123, src1.s4) * (int8)weights_values0.s5;  \
+        acc += (int8)(src0.s67, src1.s012, src1.s345) * (int8)weights_values0.s6;  \
+        acc += (int8)(src0.s7, src1.s0123, src1.s456) * (int8)weights_values0.s7;  \
+        acc += src1 * (int8)weights_value1;                                        \
+    })
+
+#define CONVOLUTION1x9_STRIDE2(acc, src_ptr, weights_ptr)                          \
+    ({                                                                             \
+        int8 weights_values0 = 0;                                                  \
+        int  weights_value1  = 0;                                                  \
+        weights_values0.s0   = convert_int(*(weights_ptr + 0 * weights_stride_y)); \
+        weights_values0.s1   = convert_int(*(weights_ptr + 1 * weights_stride_y)); \
+        weights_values0.s2   = convert_int(*(weights_ptr + 2 * weights_stride_y)); \
+        weights_values0.s3   = convert_int(*(weights_ptr + 3 * weights_stride_y)); \
+        weights_values0.s4   = convert_int(*(weights_ptr + 4 * weights_stride_y)); \
+        weights_values0.s5   = convert_int(*(weights_ptr + 5 * weights_stride_y)); \
+        weights_values0.s6   = convert_int(*(weights_ptr + 6 * weights_stride_y)); \
+        weights_values0.s7   = convert_int(*(weights_ptr + 7 * weights_stride_y)); \
+        weights_value1       = convert_int(*(weights_ptr + 8 * weights_stride_y)); \
+        \
+        int16 src0 = 0;                                                            \
+        int8  src1 = 0;                                                            \
+        src0.s0    = convert_int(*(src_ptr + 0 * weights_stride_y));               \
+        src0.s1    = convert_int(*(src_ptr + 1 * weights_stride_y));               \
+        src0.s2    = convert_int(*(src_ptr + 2 * weights_stride_y));               \
+        src0.s3    = convert_int(*(src_ptr + 3 * weights_stride_y));               \
+        src0.s4    = convert_int(*(src_ptr + 4 * weights_stride_y));               \
+        src0.s5    = convert_int(*(src_ptr + 5 * weights_stride_y));               \
+        src0.s6    = convert_int(*(src_ptr + 6 * weights_stride_y));               \
+        src0.s7    = convert_int(*(src_ptr + 7 * weights_stride_y));               \
+        src0.s8    = convert_int(*(src_ptr + 8 * weights_stride_y));               \
+        src0.s9    = convert_int(*(src_ptr + 9 * weights_stride_y));               \
+        src0.sA    = convert_int(*(src_ptr + 10 * weights_stride_y));              \
+        src0.sB    = convert_int(*(src_ptr + 11 * weights_stride_y));              \
+        src0.sC    = convert_int(*(src_ptr + 12 * weights_stride_y));              \
+        src0.sD    = convert_int(*(src_ptr + 13 * weights_stride_y));              \
+        src0.sE    = convert_int(*(src_ptr + 14 * weights_stride_y));              \
+        src0.sF    = convert_int(*(src_ptr + 15 * weights_stride_y));              \
+        src1.s0    = convert_int(*(src_ptr + 16 * weights_stride_y));              \
+        src1.s1    = convert_int(*(src_ptr + 17 * weights_stride_y));              \
+        src1.s2    = convert_int(*(src_ptr + 18 * weights_stride_y));              \
+        src1.s3    = convert_int(*(src_ptr + 19 * weights_stride_y));              \
+        src1.s4    = convert_int(*(src_ptr + 20 * weights_stride_y));              \
+        src1.s5    = convert_int(*(src_ptr + 21 * weights_stride_y));              \
+        src1.s6    = convert_int(*(src_ptr + 22 * weights_stride_y));              \
+        src1.s7    = convert_int(*(src_ptr + 23 * weights_stride_y));              \
+        \
+        acc += src0.s02468ACE * (int8)weights_values0.s0;                          \
+        acc += (int8)(src0.s1357, src0.s9BDF) * (int8)weights_values0.s1;          \
+        acc += (int8)(src0.s2468, src0.sACE, src1.s0) * (int8)weights_values0.s2;  \
+        acc += (int8)(src0.s3579, src0.sBDF, src1.s1) * (int8)weights_values0.s3;  \
+        acc += (int8)(src0.s468A, src0.sCE, src1.s02) * (int8)weights_values0.s4;  \
+        acc += (int8)(src0.s579, src0.sBDF, src1.s13) * (int8)weights_values0.s5;  \
+        acc += (int8)(src0.s68A, src0.sCE, src1.s024) * (int8)weights_values0.s6;  \
+        acc += (int8)(src0.s79B, src0.sDF, src1.s135) * (int8)weights_values0.s7;  \
+        acc += (int8)(src0.s8AC, src0.sE, src1.s0246) * (int8)weights_value1;      \
+    })
+
+#elif KERNEL_SIZE == 5
 
 #if STRIDE_X == 1
 #define CONVOLUTION1x5(acc, src_ptr, weights_ptr) CONVOLUTION1x5_STRIDE1(acc, src_ptr, weights_ptr)
@@ -331,7 +437,37 @@
 
     for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
     {
-#if KERNEL_SIZE == 5
+#if KERNEL_SIZE == 9
+        if(y_coord < 0)
+        {
+            const int start_z = -y_coord;
+            for(int i = start_z; i < 9; ++i)
+            {
+                CONVOLUTION1x9(values0, (src_addr + i * (int)src_stride_z), (weights_addr + i * (int)weights_stride_z));
+            }
+        }
+        else if(y_coord > (SRC_HEIGHT - 9))
+        {
+            // Avoid loading rows beyond the input height
+            const int end_z = SRC_HEIGHT - y_coord;
+            for(int i = 0; i < end_z; ++i)
+            {
+                CONVOLUTION1x9(values0, (src_addr + i * (int)src_stride_z), (weights_addr + i * (int)weights_stride_z));
+            }
+        }
+        else
+        {
+            CONVOLUTION1x9(values0, src_addr, weights_addr);
+            CONVOLUTION1x9(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+            CONVOLUTION1x9(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+            CONVOLUTION1x9(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+            CONVOLUTION1x9(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
+            CONVOLUTION1x9(values0, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z));
+            CONVOLUTION1x9(values0, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z));
+            CONVOLUTION1x9(values0, (src_addr + 7 * (int)src_stride_z), (weights_addr + 7 * (int)weights_stride_z));
+            CONVOLUTION1x9(values0, (src_addr + 8 * (int)src_stride_z), (weights_addr + 8 * (int)weights_stride_z));
+        }
+#elif KERNEL_SIZE == 5
 #if(PAD_TOP == 1) || (PAD_BOTTM == 1)
         if(y_coord < 0) // special case Z = -1 doesn't exists
         {
diff --git a/src/core/CL/cl_kernels/elementwise_operation.cl b/src/core/CL/cl_kernels/elementwise_operation.cl
index 9b87b52..52a3309 100644
--- a/src/core/CL/cl_kernels/elementwise_operation.cl
+++ b/src/core/CL/cl_kernels/elementwise_operation.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/elementwise_operation_quantized.cl b/src/core/CL/cl_kernels/elementwise_operation_quantized.cl
index a23ae2b..eb57da8 100644
--- a/src/core/CL/cl_kernels/elementwise_operation_quantized.cl
+++ b/src/core/CL/cl_kernels/elementwise_operation_quantized.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/elementwise_unary.cl b/src/core/CL/cl_kernels/elementwise_unary.cl
index e8a3fb7..3e557c0 100644
--- a/src/core/CL/cl_kernels/elementwise_unary.cl
+++ b/src/core/CL/cl_kernels/elementwise_unary.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/erode.cl b/src/core/CL/cl_kernels/erode.cl
index 6576f18..810c5fc 100644
--- a/src/core/CL/cl_kernels/erode.cl
+++ b/src/core/CL/cl_kernels/erode.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/fast_corners.cl b/src/core/CL/cl_kernels/fast_corners.cl
index 76b35b9..89c144a 100644
--- a/src/core/CL/cl_kernels/fast_corners.cl
+++ b/src/core/CL/cl_kernels/fast_corners.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/fft.cl b/src/core/CL/cl_kernels/fft.cl
index 0027fd5..eb1eec5 100644
--- a/src/core/CL/cl_kernels/fft.cl
+++ b/src/core/CL/cl_kernels/fft.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/fft_digit_reverse.cl b/src/core/CL/cl_kernels/fft_digit_reverse.cl
index 040c284..200ab91 100644
--- a/src/core/CL/cl_kernels/fft_digit_reverse.cl
+++ b/src/core/CL/cl_kernels/fft_digit_reverse.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/fft_scale.cl b/src/core/CL/cl_kernels/fft_scale.cl
index bf78a26..270fb78 100644
--- a/src/core/CL/cl_kernels/fft_scale.cl
+++ b/src/core/CL/cl_kernels/fft_scale.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/fill_border.cl b/src/core/CL/cl_kernels/fill_border.cl
index 9d6a2b8..5775d89 100644
--- a/src/core/CL/cl_kernels/fill_border.cl
+++ b/src/core/CL/cl_kernels/fill_border.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,7 +31,7 @@
  * @attention  The border size for top, bottom, left, right needs to be passed at the compile time.
  * e.g. --DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2
  *
- * @param[in,out] buf_ptr                           Pointer to the source image. Supported data types: U8/U16/S16/U32/S32/F16/F32
+ * @param[in,out] buf_ptr                           Pointer to the source image. Supported data types: All
  * @param[in]     buf_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in]     buf_step_x                        buf_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]     buf_stride_y                      Stride of the source image in Y dimension (in bytes)
@@ -106,7 +106,7 @@
  * @attention  The border size for top, bottom, left, right needs to be passed at the compile time.
  * e.g. --DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2
  *
- * @param[out] buf_ptr                           Pointer to the source image. Supported data types: U8/U16/S16/U32/S32/F16/F32
+ * @param[out] buf_ptr                           Pointer to the source image. Supported data types: All
  * @param[in]  buf_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in]  buf_step_x                        buf_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  buf_stride_y                      Stride of the source image in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/flatten.cl b/src/core/CL/cl_kernels/flatten.cl
index 6418edc..a1a2e46 100644
--- a/src/core/CL/cl_kernels/flatten.cl
+++ b/src/core/CL/cl_kernels/flatten.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/floor.cl b/src/core/CL/cl_kernels/floor.cl
index e967e6b..1988ba4 100644
--- a/src/core/CL/cl_kernels/floor.cl
+++ b/src/core/CL/cl_kernels/floor.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/gather.cl b/src/core/CL/cl_kernels/gather.cl
index 8400419..41f439c 100644
--- a/src/core/CL/cl_kernels/gather.cl
+++ b/src/core/CL/cl_kernels/gather.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/gaussian_pyramid.cl b/src/core/CL/cl_kernels/gaussian_pyramid.cl
index 618937f..ae2c31a 100644
--- a/src/core/CL/cl_kernels/gaussian_pyramid.cl
+++ b/src/core/CL/cl_kernels/gaussian_pyramid.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index 8a95601..4ad22ec 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,7 @@
 #include "gemm_helpers.h"
 #include "repeat.h"
 
-#if defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH)
+#if defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PARTIAL_LOAD_M0) && defined(PARTIAL_LOAD_K0)
 #define INC2 (VEC_DATA_TYPE(uint, 2))(0, 1)
 #define INC3 (VEC_DATA_TYPE(uint, 3))(0, 1, 2)
 #define INC4 (VEC_DATA_TYPE(uint, 4))(0, 1, 2, 3)
@@ -43,13 +43,42 @@
     ({})
 #endif // (SRC_WIDTH % K0)
 
+#define LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                     \
+    ({                                                                                                           \
+        if(y * M0 + M0 >= SRC_HEIGHT && PARTIAL_LOAD_M0 != 0)                                                    \
+        {                                                                                                        \
+            if(x * K0 + K0 >= SRC_WIDTH && (PARTIAL_LOAD_K0 != 0))                                               \
+            {                                                                                                    \
+                LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
+            }                                                                                                    \
+            else                                                                                                 \
+            {                                                                                                    \
+                LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);              \
+            }                                                                                                    \
+        }                                                                                                        \
+        else                                                                                                     \
+        {                                                                                                        \
+            if(x * K0 + K0 >= SRC_WIDTH && (PARTIAL_LOAD_K0 != 0))                                               \
+            {                                                                                                    \
+                LOAD_TENSOR_M0XN0(M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);              \
+            }                                                                                                    \
+            else                                                                                                 \
+            {                                                                                                    \
+                LOAD_TENSOR_M0XN0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);                           \
+            }                                                                                                    \
+        }                                                                                                        \
+    })
+
 /** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (not transposed) in
  *  the output matrix unrolling the values.
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
  * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
  * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
  * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)
+ * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)
  * @note Only the following values for M0, K0 and V0 are supported:
  *                                      M0: 2,3,4,5,6,7,8
  *                                      K0: 2,3,4,8,16
@@ -61,7 +90,7 @@
  *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
  * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.
  *
- * @param[in]  src_ptr                           Pointer to the source LHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_ptr                           Pointer to the source LHS tensor. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the source LHS tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source LHS tensor in Y dimension (in bytes)
@@ -141,29 +170,10 @@
 
     // ---------------------------Load input values --------------------------------
     // Load values from the LHS matrix
-    LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
-    BOUNDARY_CONDITION_X(x, a0);
-#if M0 > 1
-    BOUNDARY_CONDITION_X(x, a1);
-#endif // M0 > 1
-#if M0 > 2
-    BOUNDARY_CONDITION_X(x, a2);
-#endif // M0 > 2
-#if M0 > 3
-    BOUNDARY_CONDITION_X(x, a3);
-#endif // M0 > 3
-#if M0 > 4
-    BOUNDARY_CONDITION_X(x, a4);
-#endif // M0 > 4
-#if M0 > 5
-    BOUNDARY_CONDITION_X(x, a5);
-#endif // M0 > 5
-#if M0 > 6
-    BOUNDARY_CONDITION_X(x, a6);
-#endif // M0 > 6
-#if M0 > 7
-    BOUNDARY_CONDITION_X(x, a7);
-#endif // M0 > 7
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);
+
+    LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);
+
     // ---------------------------Store output values ------------------------------
     REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
     STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
@@ -248,8 +258,11 @@
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
  * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
  * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
  * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)
+ * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)
  * @note Only the following values for M0, K0 and V0 are supported:
  *                                      M0: 2,3,4,5,6,7,8
  *                                      K0: 2,3,4,8,16
@@ -261,7 +274,7 @@
  *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
  * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.
  *
- * @param[in]  src_ptr                           Pointer to the source LHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_ptr                           Pointer to the source LHS tensor. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the source LHS tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source LHS tensor in Y dimension (in bytes)
@@ -340,31 +353,10 @@
     output_ptr += z * (uint)dst_stride_z;
 
     // ---------------------------Load input values --------------------------------
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);
 
-    // Load values from the LHS matrix
-    LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
-    BOUNDARY_CONDITION_X(x, a0);
-#if M0 > 1
-    BOUNDARY_CONDITION_X(x, a1);
-#endif // M0 > 1
-#if M0 > 2
-    BOUNDARY_CONDITION_X(x, a2);
-#endif // M0 > 2
-#if M0 > 3
-    BOUNDARY_CONDITION_X(x, a3);
-#endif // M0 > 3
-#if M0 > 4
-    BOUNDARY_CONDITION_X(x, a4);
-#endif // M0 > 4
-#if M0 > 5
-    BOUNDARY_CONDITION_X(x, a5);
-#endif // M0 > 5
-#if M0 > 6
-    BOUNDARY_CONDITION_X(x, a6);
-#endif // M0 > 6
-#if M0 > 7
-    BOUNDARY_CONDITION_X(x, a7);
-#endif // M0 > 7
+    LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);
+
     // ---------------------------Transpose and store block -----------------------
 
     TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0);
@@ -396,7 +388,7 @@
 #undef OUTPUT_OFFSET_X
 #undef OUTPUT_STEP_X
 }
-#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH)
+#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PARTIAL_LOAD_M0) && defined(PARTIAL_LOAD_K0)
 
 #if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
 /** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (not transposed) in
@@ -412,7 +404,7 @@
  *                                      K0: 1,2,3,4,8,16
  *                                      H0: greater than 0
  *
- * @param[in]  src_ptr                           Pointer to the source RHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_ptr                           Pointer to the source RHS tensor. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the source RHS tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source RHS tensor in Y dimension (in bytes)
@@ -566,7 +558,7 @@
  *                                      K0: 2,3,4,8,16
  *                                      H0: greater than 0
  *
- * @param[in]  src_ptr                           Pointer to the source RHS tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_ptr                           Pointer to the source RHS tensor. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the source RHS tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source RHS tensor in Y dimension (in bytes)
@@ -1016,6 +1008,8 @@
  * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
  * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
  * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
  * @note Only the following configurations of M0, N0 and K0 are currently supported:
  *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
  *  - N0 = 2, 3, 4, 8, 16
@@ -1110,7 +1104,7 @@
 #endif // defined(DUMMY_WORK_ITEMS)
 
     // Compute LHS matrix address
-    uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
 
     // Compute RHS reshaped matrix address
     uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
@@ -1226,7 +1220,7 @@
         rhs_offset += sizeof(DATA_TYPE);
     }
 
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
 
     REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
 
@@ -1266,8 +1260,7 @@
     ADD_BLOCK_BROADCAST(M0, c, bias0);
 
 #else // defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
-                                    2) * bias_stride_z;
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
 
     LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
 
@@ -1285,102 +1278,438 @@
     ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
 #endif // defined(ACTIVATION_TYPE)
 
+    const bool cond_y = y == 0;
+    const bool cond_x = ((x + 1) * N0 >= N);
+
     // Store output block
-    STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 
 #undef RHS_BLOCK_SIZE
 #undef RHS_OFFSET_X
 #undef RHS_STEP_X
 }
 
+#if defined(OPENCL_IMAGE_SUPPORT)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image
+ *  The LHS matrix is NOT reshaped
+ *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
+ *
+ * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)
+ * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
+ *       Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
+ *       could be different from the value returned by get_image_height(rhs_img).
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 4, 8, 16
+ *  - K0 = 4, 8, 16
+ *  - H0 >= 1
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in]  lhs_ptr                            Pointer to the LHS matrix. Supported data type: F32
+ * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS matrix
+ * @param[in]  rhs_img                            The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  lhs_stride_z                       Stride of the LHS matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
+                                                  __read_only image2d_t rhs_img,
+#if defined(BETA)
+                                                  IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+                                                  IMAGE_DECLARATION(dst),
+                                                  uint lhs_stride_z,
+                                                  uint rhs_stride_z,
+#if defined(BETA)
+                                                  uint bias_stride_z,
+#endif //defined(BETA)
+                                                  uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                                  ,
+                                                  uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                  ,
+                                                  uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                 )
+{
+    // Pixel unit
+#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
+
+#define LEFTOVER_K (K % K0)
+
+    // Block size
+#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (PIXEL_UNIT)
+#define RHS_STEP_X (PIXEL_UNIT * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X PIXEL_UNIT
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+    uint x = get_global_id(0);
+    uint y = get_global_id(1);
+    uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x * N0 >= N) || (y * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
+#else  // defined(MATRIX_B_DEPTH)
+    const uint z_rhs = get_global_id(2);
+#endif // defined(MATRIX_B_DEPTH)
+
+    // Compute RHS matrix coordinates
+    uint       x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
+    const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply lhs_stride_z by DEPTH_GEMM3D
+    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);
+
+    int i = 0;
+    for(; i <= (K - K0); i += K0)
+    {
+        // Load values from LHS matrix
+        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+        // Load values from RHS matrix stored in a cl_image
+        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
+        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
+
+        // Accumulate
+        ARM_DOT_K0XN0(K0, a0, b, c0);
+#if M0 > 1
+        ARM_DOT_K0XN0(K0, a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+        ARM_DOT_K0XN0(K0, a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+        ARM_DOT_K0XN0(K0, a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+        ARM_DOT_K0XN0(K0, a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+        ARM_DOT_K0XN0(K0, a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+        ARM_DOT_K0XN0(K0, a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+        ARM_DOT_K0XN0(K0, a7, b, c7);
+#endif // M0 > 7
+
+        lhs_offset += K0 * sizeof(DATA_TYPE);
+        x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
+    }
+
+#if LEFTOVER_K != 0
+    // Note: We cannot read out-of-bound elements from the RHS matrix because
+    // the RHS width is always a multiple of K0. This is not true for the LHS matrix
+
+    union UNION_VEC_TYPE
+    {
+        DATA_TYPE s[K0];
+        VEC_DATA_TYPE(DATA_TYPE, K0)
+        v;
+    };
+
+    union UNION_VEC_TYPE a0 = {.v = 0 };
+#if M0 > 1
+    union UNION_VEC_TYPE a1 = {.v = 0 };
+#endif // M0 > 1
+#if M0 > 2
+    union UNION_VEC_TYPE a2 = {.v = 0 };
+#endif // M0 > 2
+#if M0 > 3
+    union UNION_VEC_TYPE a3 = {.v = 0 };
+#endif // M0 > 3
+#if M0 > 4
+    union UNION_VEC_TYPE a4 = {.v = 0 };
+#endif // M0 > 4
+#if M0 > 5
+    union UNION_VEC_TYPE a5 = {.v = 0 };
+#endif // M0 > 5
+#if M0 > 6
+    union UNION_VEC_TYPE a6 = {.v = 0 };
+#endif // M0 > 6
+#if M0 > 7
+    union UNION_VEC_TYPE a7 = {.v = 0 };
+#endif // M0 > 7
+
+    REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
+
+    // Load from RHS matrix
+    LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
+
+    // Load from LHS matrix
+    for(int k = 0; k < LEFTOVER_K; ++k)
+    {
+        a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);
+#if M0 > 1
+        a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);
+#endif // M0 > 1
+#if M0 > 2
+        a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);
+#endif // M0 > 2
+#if M0 > 3
+        a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);
+#endif // M0 > 3
+#if M0 > 4
+        a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);
+#endif // M0 > 4
+#if M0 > 5
+        a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);
+#endif // M0 > 5
+#if M0 > 6
+        a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);
+#endif // M0 > 6
+#if M0 > 7
+        a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);
+#endif // M0 > 7
+
+        lhs_offset += sizeof(DATA_TYPE);
+    }
+
+    // Accumulate
+    ARM_DOT_K0XN0(K0, a0.v, b, c0);
+#if M0 > 1
+    ARM_DOT_K0XN0(K0, a1.v, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+    ARM_DOT_K0XN0(K0, a2.v, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+    ARM_DOT_K0XN0(K0, a3.v, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+    ARM_DOT_K0XN0(K0, a4.v, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+    ARM_DOT_K0XN0(K0, a5.v, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+    ARM_DOT_K0XN0(K0, a6.v, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+    ARM_DOT_K0XN0(K0, a7.v, b, c7);
+#endif // M0 > 7
+
+#endif // LEFTOVER_K != 0
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
+
+    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias
+    ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    const bool cond_y = y == 0;
+    const bool cond_x = ((x + 1) * N0 >= N);
+
+    // Store output block
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+#undef LEFTOVER_K
+#undef PIXEL_UNIT
+}
+#endif // defined(OPENCL_IMAGE_SUPPORT)
+
 #define VFMA(a, b, c)     \
     ({                    \
         c = fma(a, b, c); \
     })
 
 #if M0 == 1
-#define LD_RHS_VFMA_M0xN0(i, a, c)                                                                               \
-    ({                                                                                                           \
-        VEC_DATA_TYPE(DATA_TYPE, N0)                                                                             \
-        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0));                                            \
+#define VFMA_M0xN0(i, a, b, c)                                        \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
     })
 #elif M0 == 2 // M0 == 2
-#define LD_RHS_VFMA_M0xN0(i, a, c)                                                                               \
-    ({                                                                                                           \
-        VEC_DATA_TYPE(DATA_TYPE, N0)                                                                             \
-        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0));                                            \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1));                                            \
+#define VFMA_M0xN0(i, a, b, c)                                        \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
     })
 #elif M0 == 3 // M0 == 3
-#define LD_RHS_VFMA_M0xN0(i, a, c)                                                                               \
-    ({                                                                                                           \
-        VEC_DATA_TYPE(DATA_TYPE, N0)                                                                             \
-        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0));                                            \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1));                                            \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2));                                            \
+#define VFMA_M0xN0(i, a, b, c)                                        \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
     })
 #elif M0 == 4 // M0 == 4
-#define LD_RHS_VFMA_M0xN0(i, a, c)                                                                               \
-    ({                                                                                                           \
-        VEC_DATA_TYPE(DATA_TYPE, N0)                                                                             \
-        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0));                                            \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1));                                            \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2));                                            \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3));                                            \
+#define VFMA_M0xN0(i, a, b, c)                                        \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
     })
 #elif M0 == 5 // M0 == 5
-#define LD_RHS_VFMA_M0xN0(i, a, c)                                                                               \
-    ({                                                                                                           \
-        VEC_DATA_TYPE(DATA_TYPE, N0)                                                                             \
-        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0));                                            \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1));                                            \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2));                                            \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3));                                            \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4));                                            \
+#define VFMA_M0xN0(i, a, b, c)                                        \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
     })
 #elif M0 == 6 // M0 == 6
-#define LD_RHS_VFMA_M0xN0(i, a, c)                                                                               \
-    ({                                                                                                           \
-        VEC_DATA_TYPE(DATA_TYPE, N0)                                                                             \
-        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0));                                            \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1));                                            \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2));                                            \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3));                                            \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4));                                            \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5));                                            \
+#define VFMA_M0xN0(i, a, b, c)                                        \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
     })
 #elif M0 == 7 // M0 == 7
-#define LD_RHS_VFMA_M0xN0(i, a, c)                                                                               \
-    ({                                                                                                           \
-        VEC_DATA_TYPE(DATA_TYPE, N0)                                                                             \
-        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0));                                            \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1));                                            \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2));                                            \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3));                                            \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4));                                            \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5));                                            \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6));                                            \
+#define VFMA_M0xN0(i, a, b, c)                                        \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
     })
 #elif M0 == 8 // M0 == 8
-#define LD_RHS_VFMA_M0xN0(i, a, c)                                                                               \
-    ({                                                                                                           \
-        VEC_DATA_TYPE(DATA_TYPE, N0)                                                                             \
-        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0));                                            \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1));                                            \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2));                                            \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3));                                            \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4));                                            \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5));                                            \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6));                                            \
-        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7));                                            \
+#define VFMA_M0xN0(i, a, b, c)                                        \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
     })
 #else // M0 not supported
 #error "M0 not supported"
@@ -1396,6 +1725,8 @@
  * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
  * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
  * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
  * @note Only the following configurations of M0, N0 and K0 are currently supported:
  *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
  *  - N0 = 2, 3, 4, 8, 16
@@ -1490,7 +1821,7 @@
 #endif // defined(DUMMY_WORK_ITEMS)
 
     // Compute LHS matrix address
-    uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
 
     // Compute RHS reshaped matrix address
     uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
@@ -1539,29 +1870,48 @@
         // Load values from LHS matrix
         LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);
 
-        LD_RHS_VFMA_M0xN0(0, a, c);
-        LD_RHS_VFMA_M0xN0(1, a, c);
+        VEC_DATA_TYPE(DATA_TYPE, N0)
+        b0;
+
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(0, a, b0, c);
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(1, a, b0, c);
 #if K0 > 2
-        LD_RHS_VFMA_M0xN0(2, a, c);
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(2, a, b0, c);
 #endif // K0 > 2
 #if K0 > 3
-        LD_RHS_VFMA_M0xN0(3, a, c);
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(3, a, b0, c);
 #endif // K0 > 3
 #if K0 > 4
-        LD_RHS_VFMA_M0xN0(4, a, c);
-        LD_RHS_VFMA_M0xN0(5, a, c);
-        LD_RHS_VFMA_M0xN0(6, a, c);
-        LD_RHS_VFMA_M0xN0(7, a, c);
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(4, a, b0, c);
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(5, a, b0, c);
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(6, a, b0, c);
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(7, a, b0, c);
 #endif // K0 > 4
 #if K0 > 8
-        LD_RHS_VFMA_M0xN0(8, a, c);
-        LD_RHS_VFMA_M0xN0(9, a, c);
-        LD_RHS_VFMA_M0xN0(A, a, c);
-        LD_RHS_VFMA_M0xN0(B, a, c);
-        LD_RHS_VFMA_M0xN0(C, a, c);
-        LD_RHS_VFMA_M0xN0(D, a, c);
-        LD_RHS_VFMA_M0xN0(E, a, c);
-        LD_RHS_VFMA_M0xN0(F, a, c);
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(8, a, b0, c);
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(9, a, b0, c);
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(A, a, b0, c);
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(B, a, b0, c);
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(C, a, b0, c);
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(D, a, b0, c);
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(E, a, b0, c);
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(F, a, b0, c);
 #endif // K0 > 8
 
         lhs_offset += K0 * sizeof(DATA_TYPE);
@@ -1603,13 +1953,17 @@
         a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
 #endif // M0 > 7
 
-        LD_RHS_VFMA_M0xN0(0, a, c);
+        VEC_DATA_TYPE(DATA_TYPE, N0)
+        b0;
+
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(0, a, b0, c);
 
         lhs_offset += sizeof(DATA_TYPE);
         rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);
     }
 
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
 
     REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
 
@@ -1648,8 +2002,7 @@
     ADD_BLOCK_BROADCAST(M0, c, bias0);
 
 #else // defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
-                                    2) * bias_stride_z;
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
 
     LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
 
@@ -1667,13 +2020,326 @@
     ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
 #endif // defined(ACTIVATION_TYPE)
 
+    const bool cond_y = y == 0;
+    const bool cond_x = ((x + 1) * N0 >= N);
+
     // Store output block
-    STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 
 #undef RHS_BLOCK_SIZE
 #undef RHS_OFFSET_X
 #undef RHS_STEP_X
 }
+
+#if defined(OPENCL_IMAGE_SUPPORT)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ *  The LHS matrix is NOT reshaped
+ *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
+ *
+ * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).
+ * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
+ *       Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
+ *       could be different from the value returned by get_image_height(rhs_img).
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 4, 8, 16
+ *  - K0 = 4, 8, 16
+ *  - H0 >= 1
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in]  lhs_ptr                            Pointer to the LHS matrix. Supported data type: F32
+ * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS matrix
+ * @param[in]  rhs_img                            The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  lhs_stride_z                       Stride of the LHS matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
+                                                   __read_only image2d_t rhs_img,
+#if defined(BETA)
+                                                   IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+                                                   IMAGE_DECLARATION(dst),
+                                                   uint lhs_stride_z,
+                                                   uint rhs_stride_z,
+#if defined(BETA)
+                                                   uint bias_stride_z,
+#endif //defined(BETA)
+                                                   uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                                   ,
+                                                   uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                   ,
+                                                   uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                  )
+{
+    // Pixel unit
+#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
+
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (PIXEL_UNIT)
+#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (PIXEL_UNIT)
+#endif // defined(RHS_INTERLEAVE)
+
+    uint x = get_global_id(0);
+    uint y = get_global_id(1);
+    uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x * N0 >= N) || (y * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    const uint z_rhs = (z % MATRIX_B_DEPTH);
+#else  // defined(MATRIX_B_DEPTH)
+    const uint z_rhs = z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    // Compute RHS matrix coordinates
+    uint       x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
+    const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+
+    // The plane (zin) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply lhs_stride_z by DEPTH_GEMM3D
+    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);
+
+    int i = 0;
+    for(; i <= (K - K0); i += K0)
+    {
+        // Load values from LHS matrix
+        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);
+
+        VEC_DATA_TYPE(DATA_TYPE, N0)
+        b0;
+
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(0, a, b0, c);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(1, a, b0, c);
+#if K0 > 2
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(2, a, b0, c);
+#endif // K0 > 2
+#if K0 > 3
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(3, a, b0, c);
+#endif // K0 > 3
+#if K0 > 4
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(4, a, b0, c);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(5, a, b0, c);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(6, a, b0, c);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(7, a, b0, c);
+#endif // K0 > 4
+#if K0 > 8
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(8, a, b0, c);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(9, a, b0, c);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(A, a, b0, c);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(B, a, b0, c);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(C, a, b0, c);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(D, a, b0, c);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(E, a, b0, c);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(F, a, b0, c);
+#endif // K0 > 8
+
+        lhs_offset += K0 * sizeof(DATA_TYPE);
+        x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP;
+    }
+
+    // Left-over accumulations
+    for(; i < K; ++i)
+    {
+        // Load values from LHS matrix
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
+#if M0 > 1
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
+#endif // M0 > 1
+#if M0 > 2
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
+#endif // M0 > 2
+#if M0 > 3
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
+#endif // M0 > 3
+#if M0 > 4
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
+#endif // M0 > 4
+#if M0 > 5
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
+#endif // M0 > 5
+#if M0 > 6
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
+#endif // M0 > 6
+#if M0 > 7
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
+#endif // M0 > 7
+
+        VEC_DATA_TYPE(DATA_TYPE, N0)
+        b0;
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
+
+        VFMA_M0xN0(0, a, b0, c);
+
+        lhs_offset += sizeof(DATA_TYPE);
+        x_rhs += RHS_STEP_X;
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
+
+    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias
+    ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    const bool cond_y = y == 0;
+    const bool cond_x = ((x + 1) * N0 >= N);
+
+    // Store output block
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(OPENCL_IMAGE_SUPPORT)
 #endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
 
 #if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N)
@@ -1859,12 +2525,14 @@
  * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
  * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float
  * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
- * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (e.g. -DM=52 and -DN=90).
+ * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
  * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
  * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
  * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
  * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
  * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
  * @note Only the following configurations of M0, N0 and K0 are currently supported:
  *  - M0 = 2, 3, 4, 5, 6, 7, 8
  *  - N0 = 2, 3, 4, 8, 16
@@ -2101,11 +2769,15 @@
 #endif // defined(MIXED_PRECISION)
 #endif // defined(ACTIVATION_TYPE)
 
+    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
     // Store output block
 #if defined(MIXED_PRECISION)
-    CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 #else  // defined(MIXED_PRECISION)
-    STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 #endif // defined(MIXED_PRECISION)
 
 #undef LHS_BLOCK_SIZE
@@ -2114,8 +2786,282 @@
 #undef RHS_BLOCK_SIZE
 #undef RHS_OFFSET_X
 #undef RHS_STEP_X
+#undef LHS_STEP_LOOP
+#undef RHS_STEP_LOOP
 }
 
+#if defined(OPENCL_IMAGE_SUPPORT)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
+ *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
+ *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
+ *
+ * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
+ * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
+ * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
+ *       Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
+ *       could be different from the value returned by get_image_height(rhs_img).
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 4, 8, 16
+ *  - K0 = 4, 8, 16
+ *  - V0 >= 1
+ *  - H0 >= 1
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
+ *
+ * @param[in]  lhs_ptr                            Pointer to the LHS reshaped matrix. Supported data type: F32
+ * @param[in]  lhs_stride_x                       Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                       Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS reshaped matrix
+ * @param[in]  rhs_img                            The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  k                                  Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ * @param[in]  lhs_stride_z                       Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
+                                                    __read_only image2d_t rhs_img,
+#if defined(BETA)
+                                                    IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+                                                    IMAGE_DECLARATION(dst),
+                                                    uint k, // NOTE(review): body uses the compile-time -DK define; this runtime 'k' appears unused here — confirm before removing
+                                                    uint lhs_stride_z,
+                                                    uint rhs_stride_z,
+#if defined(BETA)
+                                                    uint bias_stride_z,
+#endif // defined(BETA)
+                                                    uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                    ,
+                                                    uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                   )
+{
+    // Pixel unit: number of texture pixels needed to hold K0 elements of DATA_TYPE
+#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
+
+    // LHS block size (in elements)
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (K0)
+#define LHS_STEP_X ((K0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (K0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(LHS_INTERLEAVE)
+
+    // RHS block size (in pixel units)
+#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (PIXEL_UNIT)
+#define RHS_STEP_X (PIXEL_UNIT * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X PIXEL_UNIT
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
+                               (get_global_id(2) * lhs_stride_z);
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
+#else  // defined(MATRIX_B_DEPTH)
+    const uint z_rhs = get_global_id(2);
+#endif // defined(MATRIX_B_DEPTH)
+
+    // Compute RHS matrix coordinates
+    uint       x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
+    const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+    for(int i = 0; i < K; i += K0)
+    {
+        // Load values from LHS matrix
+        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
+
+        // Load values from RHS matrix stored in a cl_image
+        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
+        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
+
+        // Accumulate
+        ARM_DOT_K0XN0(a0, b, c0);
+#if M0 > 1
+        ARM_DOT_K0XN0(a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+        ARM_DOT_K0XN0(a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+        ARM_DOT_K0XN0(a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+        ARM_DOT_K0XN0(a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+        ARM_DOT_K0XN0(a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+        ARM_DOT_K0XN0(a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+        ARM_DOT_K0XN0(a7, b, c7);
+#endif // M0 > 7
+
+        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
+
+        x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += get_global_id(2) * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias[broadcasted]
+#if defined(MIXED_PRECISION)
+    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
+#else  // defined(MIXED_PRECISION)
+    ADD_BLOCK_BROADCAST(M0, c, bias0);
+#endif // defined(MIXED_PRECISION)
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
+                                    2) * bias_stride_z;
+
+    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias
+#if defined(MIXED_PRECISION)
+    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+    ADD_BLOCK(M0, c, bias_hp);
+#else  // defined(MIXED_PRECISION)
+    ADD_BLOCK(M0, c, bias);
+#endif // defined(MIXED_PRECISION)
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+#if defined(MIXED_PRECISION)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);
+#else  // defined(MIXED_PRECISION)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
+#endif // defined(MIXED_PRECISION)
+#endif // defined(ACTIVATION_TYPE)
+
+    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
+    // Store output block
+#if defined(MIXED_PRECISION)
+    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
+#else  // defined(MIXED_PRECISION)
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
+#endif // defined(MIXED_PRECISION)
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+#undef PIXEL_UNIT
+#undef LHS_STEP_LOOP
+#undef RHS_STEP_LOOP
+}
+#endif // defined(OPENCL_IMAGE_SUPPORT)
+
 #if defined(LHS_TRANSPOSE)
 
 #define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE)
@@ -2232,12 +3178,14 @@
  *
  * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
  * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
- * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (e.g. -DM=52 and -DN=90).
+ * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
  * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
  * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
  * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
  * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
  * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
  * @note Only the following configurations of M0, N0 and K0 are currently supported:
  *  - M0 = 2, 3, 4, 8
  *  - N0 = 2, 3, 4, 8, 16
@@ -2363,8 +3311,11 @@
     for(int i = 0; i < k; i += K0)
     {
         VEC_DATA_TYPE(DATA_TYPE, M0)
-        a0 = VLOAD(M0)(0, lhs);
+        a0;
         VEC_DATA_TYPE(DATA_TYPE, N0)
+        b0;
+
+        a0 = VLOAD(M0)(0, lhs);
         b0 = VLOAD(N0)(0, rhs);
 
         ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
@@ -2555,6 +3506,374 @@
 #endif // defined(MIXED_PRECISION)
 
 #else // defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
+                                    2) * bias_stride_z;
+
+    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+#if defined(MIXED_PRECISION)
+    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+    ADD_BLOCK(M0, c, bias_hp);
+#else  // defined(MIXED_PRECISION)
+    ADD_BLOCK(M0, c, bias);
+#endif // defined(MIXED_PRECISION)
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+#if defined(MIXED_PRECISION)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);
+#else  // defined(MIXED_PRECISION)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
+#endif // defined(MIXED_PRECISION)
+#endif // defined(ACTIVATION_TYPE)
+
+    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
+    // Store output block
+#if defined(MIXED_PRECISION)
+    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
+#else  // defined(MIXED_PRECISION)
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
+#endif // defined(MIXED_PRECISION)
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+
+#if defined(OPENCL_IMAGE_SUPPORT)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
+ *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
+ *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
+ *
+ * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
+ * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
+ * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
+ *       Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
+ *       could be different from the value returned by get_image_height(rhs_img).
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 2, 3, 4, 8
+ *  - N0 = 4, 8, 16
+ *  - K0 = 4, 8, 16
+ *  - V0 >= 1
+ *  - H0 >= 1
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
+ *
+ * @param[in]  lhs_ptr                            Pointer to the LHS reshaped matrix. Supported data type: F32
+ * @param[in]  lhs_stride_x                       Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                       Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS reshaped matrix
+ * @param[in]  rhs_img                            The RHS reshaped matrix as cl_image 2d. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  k                                  Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ * @param[in]  lhs_stride_z                       Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
+                                                    __read_only image2d_t rhs_img,
+#if defined(BETA)
+                                                    IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+                                                    IMAGE_DECLARATION(dst),
+                                                    uint k,
+                                                    uint lhs_stride_z,
+                                                    uint rhs_stride_z,
+#if defined(BETA)
+                                                    uint bias_stride_z,
+#endif //defined(BETA)
+                                                    uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                    ,
+                                                    uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                   )
+{
+    // Pixel unit
+#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
+
+    // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (M0)
+#define LHS_STEP_X ((M0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (M0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(LHS_INTERLEAVE)
+
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (PIXEL_UNIT)
+#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (PIXEL_UNIT)
+#endif // defined(RHS_INTERLEAVE)
+
+    const uint x = get_global_id(0);
+    const uint y = get_global_id(1);
+    const uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x * N0 >= N) || (y * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    const uint z_rhs = (z % MATRIX_B_DEPTH);
+#else  // defined(MATRIX_B_DEPTH)
+    const uint z_rhs = z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    // Compute RHS matrix coordinates
+    uint       x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
+    const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
+
+    __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
+
+    for(int i = 0; i < K; i += K0)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, M0)
+        a0;
+        VEC_DATA_TYPE(DATA_TYPE, N0)
+        b0;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+#if K0 > 1
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+#endif // K0 > 1
+
+#if K0 > 2
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+#endif // K0 > 2
+
+#if K0 > 3
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+#endif // K0 > 3
+
+#if K0 > 4
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+#endif // K0 > 4
+
+#if K0 > 8
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+#endif // K0 > 8
+
+#ifndef LHS_INTERLEAVE
+        lhs += (M0 * K0 * (V0 - 1));
+#endif // LHS_INTERLEAVE
+
+        x_rhs += K0 * RHS_STEP_X;
+#ifndef RHS_INTERLEAVE
+        x_rhs += (PIXEL_UNIT * K0 * (H0 - 1));
+#endif // RHS_INTERLEAVE
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
+
+    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias[broadcasted]
+#if defined(MIXED_PRECISION)
+    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
+#else  // defined(MIXED_PRECISION)
+    ADD_BLOCK_BROADCAST(M0, c, bias0);
+#endif // defined(MIXED_PRECISION)
+
+#else // defined(BROADCAST_BIAS)
     __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;
 
     LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
@@ -2581,11 +3900,15 @@
 #endif // defined(MIXED_PRECISION)
 #endif // defined(ACTIVATION_TYPE)
 
+    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
     // Store output block
 #if defined(MIXED_PRECISION)
-    CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 #else  // defined(MIXED_PRECISION)
-    STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 #endif // defined(MIXED_PRECISION)
 
 #undef LHS_BLOCK_SIZE
@@ -2594,7 +3917,11 @@
 #undef RHS_BLOCK_SIZE
 #undef RHS_OFFSET_X
 #undef RHS_STEP_X
+#undef PIXEL_UNIT
+#undef LHS_STEP_LOOP
+#undef RHS_STEP_LOOP
 }
+#endif // defined(OPENCL_IMAGE_SUPPORT)
 
 #endif // defined(LHS_TRANSPOSE)
 
@@ -2689,6 +4016,8 @@
  * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
  * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g., -DK0=2)
  * @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
  * @note Only the following configurations of M0, N0 and K0 are currently supported:
  *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
  *  - N0 = 2, 3, 4, 8, 16
@@ -2774,7 +4103,7 @@
 #endif // defined(DUMMY_WORK_ITEMS)
 
     // Compute LHS matrix address
-    uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
 
     // Compute RHS matrix address
     uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);
@@ -2897,7 +4226,7 @@
         rhs_offset += rhs_stride_y;
     }
 
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
 
     REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
 
@@ -2936,8 +4265,7 @@
     ADD_BLOCK_BROADCAST(M0, c, bias0);
 
 #else // defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
-                                    2) * bias_stride_z;
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
 
     LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
 
@@ -2955,8 +4283,11 @@
     ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
 #endif // defined(ACTIVATION_TYPE)
 
+    const bool cond_y = y == 0;
+    const bool cond_x = ((x + 1) * N0 >= N);
+
     // Store output block
-    STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, cond_y, cond_x);
 
 #undef RHS_BLOCK_SIZE
 #undef RHS_OFFSET_X
@@ -6322,39 +7653,3 @@
     vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
 }
 #endif // defined(WIDTH_VECTOR_A)
-
-/** This kernel accumulates each row with the biases vector.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=short.
- * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=16.
- *
- * @param[in, out] accum_ptr                            Pointer to the accumulate tensor. Supported data type: U8/S8/U16/S16/F16/U32/S32/F32
- * @param[in]      accum_stride_x                       Stride of the accmulate tensor in X dimension (in bytes)
- * @param[in]      accum_step_x                         accum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]      accum_stride_y                       Stride of the accumlulate tensor in Y dimension (in bytes)
- * @param[in]      accum_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]      accum_offset_first_element_in_bytes  The offset of the first element in the accumulate tensor
- * @param[in]      biases_ptr                           Pointer to the biases vector. Same as @p accum_ptr
- * @param[in]      biases_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]      biases_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]      biases_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-#if defined(DATA_TYPE) && defined(VECTOR_SIZE)
-__kernel void gemm_accumulate_biases(
-    IMAGE_DECLARATION(accum),
-    VECTOR_DECLARATION(biases))
-{
-    Image  accum  = CONVERT_TO_IMAGE_STRUCT(accum);
-    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
-
-    // Vector size, e.g. number of vector elements.
-    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-    accum_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)accum.ptr);
-    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-    biases_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
-    accum_value  = biases_value + accum_value;
-    // Store result in the accumulate buffer
-    VSTORE(VECTOR_SIZE)
-    (accum_value, 0, (__global DATA_TYPE *)accum.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(VECTOR_SIZE)
diff --git a/src/core/CL/cl_kernels/gemm_helpers.h b/src/core/CL/cl_kernels/gemm_helpers.h
index af43477..6f6edc1 100644
--- a/src/core/CL/cl_kernels/gemm_helpers.h
+++ b/src/core/CL/cl_kernels/gemm_helpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,10 +24,268 @@
 #include "activation_float_helpers.h"
 #include "helpers.h"
 
+/** Utility macro to access a vector with the scalar positions
+ *
+ * Supported cases are: Offset can only be of the same size as the OpenCL vector (2,3,4,8,16)
+ *
+ * @param[in] offset The offset within the vector. Offset can only be of the same size as the OpenCL vector (2,3,4,8,16)
+ * @param[in] n0     The number of consecutive columns to access. n0 + offset must be <= 16
+ * @param[in] x      Vector to access
+ * @{
+ */
+#define SCALAR_ACCESS_STR(offset, n0, x) scalar_access_##offset##_##n0(x)
+#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x)
+
+// offset == 0
+#define scalar_access_0_1(x) ((x).s0)
+#define scalar_access_0_2(x) ((x).s01)
+#define scalar_access_0_3(x) ((x).s012)
+#define scalar_access_0_4(x) ((x).s0123)
+#define scalar_access_0_8(x) ((x).s01234567)
+#define scalar_access_0_16(x) ((x).s0123456789ABCDEF)
+
+// offset == 1
+#define scalar_access_1_1(x) ((x).s1)
+#define scalar_access_1_2(x) ((x).s12)
+#define scalar_access_1_3(x) ((x).s123)
+#define scalar_access_1_4(x) ((x).s1234)
+#define scalar_access_1_8(x) ((x).s12345678)
+
+// offset == 2
+#define scalar_access_2_1(x) ((x).s2)
+#define scalar_access_2_2(x) ((x).s23)
+#define scalar_access_2_3(x) ((x).s234)
+#define scalar_access_2_4(x) ((x).s2345)
+#define scalar_access_2_8(x) ((x).s23456789)
+
+// offset == 3
+#define scalar_access_3_1(x) ((x).s3)
+#define scalar_access_3_2(x) ((x).s34)
+#define scalar_access_3_3(x) ((x).s345)
+#define scalar_access_3_4(x) ((x).s3456)
+#define scalar_access_3_8(x) ((x).s3456789A)
+
+// offset == 4
+#define scalar_access_4_1(x) ((x).s4)
+#define scalar_access_4_2(x) ((x).s45)
+#define scalar_access_4_3(x) ((x).s456)
+#define scalar_access_4_4(x) ((x).s4567)
+#define scalar_access_4_8(x) ((x).s456789AB)
+
+// offset == 8
+#define scalar_access_8_1(x) ((x).s8)
+#define scalar_access_8_2(x) ((x).s89)
+#define scalar_access_8_3(x) ((x).s89A)
+#define scalar_access_8_4(x) ((x).s89AB)
+#define scalar_access_8_8(x) ((x).s89ABCDEF)
+
+// offset == 12
+#define scalar_access_12_1(x) ((x).sC)
+#define scalar_access_12_2(x) ((x).sCD)
+#define scalar_access_12_3(x) ((x).sCDE)
+#define scalar_access_12_4(x) ((x).sCDEF)
+
+// offset == 16
+#define scalar_access_16_1(x) ((x).sF)
+
+/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1) without allocating variables.
+ * @name LOAD_TENSOR_ROW_n
+ *
+ * @param[in] N0         The number of columns to load
+ * @param[in] DATA_TYPE  The data type of variables
+ * @param[in] BASENAME   The basename of the destination variables for the loaded rows
+ * @param[in] PTR        The base pointer
+ * @param[in] COL_OFFSET The column vector offset. COL_OFFSET + N0 must be <= 16
+ * @param[in] STRIDE_Y   The stride value in y-axis direction
+ * @param[in] Z          The z-axis offset vector
+ * @{
+ */
+#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+    ({})
+
+#define LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##0) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
+
+#define LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+    LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
+    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##1) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
+
+#define LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+    LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
+    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##2) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
+
+#define LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+    LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
+    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##3) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
+
+#define LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+    LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
+    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##4) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
+
+#define LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+    LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
+    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##5) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
+
+#define LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+    LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
+    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##6) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
+
+#define LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+    LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
+    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##7) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
+
+#define LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+    LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
+    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##8) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
+
+#define LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+    LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)      \
+    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##9) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
+
+#define LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+    LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
+    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##A) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
+
+#define LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+    LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
+    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##B) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
+
+#define LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+    LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
+    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##C) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
+
+#define LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+    LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
+    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##D) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
+
+#define LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+    LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
+    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##E) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
+
+#define LOAD_TENSOR_ROW_16(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+    LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)     \
+    SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##F) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
+/** @}*/ // end of group LOAD_TENSOR_ROW_n
+
+/** Load tensor (consecutive rows and columns) with Z offset.
+ * @name LOAD_TENSOR
+ *
+ * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16
+ * The data to load is expected to have consecutive names for each row.
+ * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3, and Z=zin, the expected Z offsets are zin0, zin1 and zin2.
+ *
+ * @param[in] M0         The number of consecutive rows
+ * @param[in] N0         The number of consecutive columns
+ * @param[in] DATA_TYPE  The data type of the target
+ * @param[in] BASENAME   The basename of the result variables
+ * @param[in] PTR        The base pointer for the data
+ * @param[in] COL_OFFSET The column vector offset. COL_OFFSET + N0 must be <= 16
+ * @param[in] STRIDE_Y   The stride in y-axis direction
+ * @param[in] Z          The z-axis offset vector
+ * @{
+ */
+/* Two-level macro so that M0 is fully expanded before being token-pasted onto LOAD_TENSOR_ROW_. */
+#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
+#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
+/** @} */ // end of group LOAD_TENSOR
+
+/** Load 2D tensor (consecutive rows and columns) with Z offset.
+ * @name LOAD_TENSOR_M0Xn
+ *
+ * Column counts that are not a supported vector width (1, 2, 3, 4, 8, 16) are
+ * realized as one wide load followed by a narrower load at a column offset
+ * (e.g. 9 columns = 8 + 1, 14 columns = 8 + 4 + 2).
+ *
+ * @param[in] M0        The number of rows to load [0-16]
+ * @param[in] N0        The number of columns to load [0-16]
+ * @param[in] DATA_TYPE The data type of variables
+ * @param[in] BASENAME  The basename of the destination variables for the loaded rows
+ * @param[in] PTR       The base pointer
+ * @param[in] STRIDE_Y  The stride value in y-axis direction
+ * @param[in] Z         The z-axis offset vector
+ * @{
+ */
+#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+    ({})
+
+#define LOAD_TENSOR_M0X1(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X2(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X3(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X4(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X5(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
+    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X6(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
+    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X7(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
+    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X8(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X9(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);       \
+    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X10(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
+    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X11(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
+    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X12(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);        \
+    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X13(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
+    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);                         \
+    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
+    LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X14(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
+    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);                         \
+    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
+    LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X15(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                  \
+    LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);                         \
+    LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
+    LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);
+
+#define LOAD_TENSOR_M0X16(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+    LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
+/** @}*/ // end of group LOAD_TENSOR_M0Xn
+
+/** Load 2D tensor (consecutive rows and columns) with Z offset.
+ * @name LOAD_TENSOR_M0XN0
+ *
+ * @param[in] M0        The number of consecutive rows [0-16]
+ * @param[in] N0        The number of consecutive columns [0-16]
+ * @param[in] DATA_TYPE The data type of the target
+ * @param[in] BASENAME  The basename of the result variables
+ * @param[in] PTR       The base pointer for the data
+ * @param[in] STRIDE_Y  The stride in y-axis direction
+ * @param[in] Z         The z-axis offset vector
+ * @{
+ */
+/* Two-level macro so that N0 is fully expanded before being token-pasted onto LOAD_TENSOR_M0X. */
+#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+
 /** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
  * @name LOAD_ROW_n
  *
- * @param[in] N0        The number of rows to load
+ * @param[in] N0        The number of columns to load
  * @param[in] DATA_TYPE The data type of variables
  * @param[in] BASENAME  The basename of the destination variables for the loaded rows
  * @param[in] PTR       The base pointer
@@ -140,6 +398,105 @@
 #define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
 /** @} */ // end of group LOAD_BLOCK
 
+/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
+ * @name LOAD_TEXTURE2D_ROW_n
+ *
+ * @param[in] N0         The number of pixels to read
+ * @param[in] DATA_TYPE  The data type of variables
+ * @param[in] BASENAME   The basename of the destination variables for the loaded rows
+ * @param[in] IMG        The 2D OpenCL image object
+ * @param[in] X_COORD    The x coordinate for the top-left pixel
+ * @param[in] Y_COORD    The y coordinate for the top-left pixel
+ * @param[in] X_STEP_ROW The incremental step row for the x coordinate (in pixels)
+ * @param[in] Y_STEP_ROW The incremental step row for the y coordinate (in pixels)
+ * @note Rows 10-15 write to hex-style suffixed destination variables (BASENAME##A ... BASENAME##F).
+ * @{
+ */
+#define LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
+    BASENAME##0 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 0 * X_STEP_ROW), (Y_COORD + 0 * Y_STEP_ROW))
+
+#define LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
+    LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
+    BASENAME##1 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 1 * X_STEP_ROW), (Y_COORD + 1 * Y_STEP_ROW))
+
+#define LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
+    LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
+    BASENAME##2 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 2 * X_STEP_ROW), (Y_COORD + 2 * Y_STEP_ROW))
+
+#define LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
+    LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
+    BASENAME##3 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 3 * X_STEP_ROW), (Y_COORD + 3 * Y_STEP_ROW))
+
+#define LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
+    LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
+    BASENAME##4 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 4 * X_STEP_ROW), (Y_COORD + 4 * Y_STEP_ROW))
+
+#define LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
+    LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
+    BASENAME##5 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 5 * X_STEP_ROW), (Y_COORD + 5 * Y_STEP_ROW))
+
+#define LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
+    LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
+    BASENAME##6 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 6 * X_STEP_ROW), (Y_COORD + 6 * Y_STEP_ROW))
+
+#define LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
+    LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
+    BASENAME##7 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 7 * X_STEP_ROW), (Y_COORD + 7 * Y_STEP_ROW))
+
+#define LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
+    LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
+    BASENAME##8 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 8 * X_STEP_ROW), (Y_COORD + 8 * Y_STEP_ROW))
+
+#define LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
+    LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)      \
+    BASENAME##9 = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 9 * X_STEP_ROW), (Y_COORD + 9 * Y_STEP_ROW))
+
+#define LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
+    LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
+    BASENAME##A = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 10 * X_STEP_ROW), (Y_COORD + 10 * Y_STEP_ROW))
+
+#define LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
+    LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
+    BASENAME##B = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 11 * X_STEP_ROW), (Y_COORD + 11 * Y_STEP_ROW))
+
+#define LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
+    LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
+    BASENAME##C = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 12 * X_STEP_ROW), (Y_COORD + 12 * Y_STEP_ROW))
+
+#define LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
+    LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
+    BASENAME##D = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 13 * X_STEP_ROW), (Y_COORD + 13 * Y_STEP_ROW))
+
+#define LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
+    LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
+    BASENAME##E = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 14 * X_STEP_ROW), (Y_COORD + 14 * Y_STEP_ROW))
+
+#define LOAD_TEXTURE2D_ROW_16(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
+    LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)     \
+    BASENAME##F = READ_IMAGE2D(DATA_TYPE, N0, IMG, (X_COORD + 15 * X_STEP_ROW), (Y_COORD + 15 * Y_STEP_ROW))
+/** @} */ // end of group LOAD_TEXTURE2D_ROW_n
+
+/** Load a 2D texture in unit of pixel. A pixel is made of 4 floating point values
+ * @name LOAD_TEXTURE2D
+ *
+ * Supported cases are M0=1,2,3,...,16 and N0=1,2,4 (see @p N0 below)
+ * The data to load is expected to have consecutive names for each row.
+ * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
+ *
+ * @param[in] M0         The number of consecutive rows
+ * @param[in] N0         The number of consecutive pixels. Only 1, 2 and 4 are supported
+ * @param[in] DATA_TYPE  The data type of the target
+ * @param[in] BASENAME   The basename of the result variables
+ * @param[in] IMG        The 2D OpenCL image object
+ * @param[in] X_COORD    The x coordinate for the top-left pixel
+ * @param[in] Y_COORD    The y coordinate for the top-left pixel
+ * @param[in] X_STEP_ROW The incremental step row for the x coordinate (in pixels)
+ * @param[in] Y_STEP_ROW The incremental step row for the y coordinate (in pixels)
+ * @{
+ */
+/* Two-level macro so that M0 is fully expanded before being token-pasted onto LOAD_TEXTURE2D_ROW_. */
+#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
+#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
+/** @} */ // end of group LOAD_TEXTURE2D
+
 /** Loads the elements from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
  * @name LOAD_ELEMENT_n
  *
@@ -354,7 +711,7 @@
 /** Store the 0 to (n-1)th rows of the given variables
  * @name STORE_ROW_n
  *
- * @param[in] N0        The size of the vectors
+ * @param[in] N0        The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16
  * @param[in] DATA_TYPE The data type of the vectors
  * @param[in] BASENAME  The basename of the variables
  * @param[in] PTR       The base pointer
@@ -442,6 +799,101 @@
     (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
 /** @} */ // end of groupd STORE_ROW_n
 
+/** Partially store the 0 to (n-1)th rows of the given variables
+ * @name STORE_ROW_PARTIAL_n
+ * Within each row, store the lower @p STORE_N0 elements of vectors of width @p N0
+ *
+ * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
+ *
+ * @param[in] N0        The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16
+ * @param[in] STORE_N0  The **lower** size of the vectors to store. Supported: [1-16] and <= @p N0
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME  The basename of the variables
+ * @param[in] PTR       The base pointer
+ * @param[in] STRIDE_Y  The stride value in y-axis direction
+ * @param[in] Z         The offset in z-axis direction
+ * @{
+ */
+#define STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
+    (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
+
+#define STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_1(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
+    (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
+
+#define STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_2(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
+    (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
+
+#define STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_3(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
+    (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
+
+#define STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_4(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
+    (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
+
+#define STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_5(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
+    (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
+
+#define STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_6(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
+    (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
+
+#define STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_7(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
+    (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
+
+#define STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_8(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE_PARTIAL(N0, STORE_N0)                                                 \
+    (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
+
+#define STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_9(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)      \
+    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
+    (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
+
+#define STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_10(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
+    (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
+
+#define STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_11(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
+    (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
+
+#define STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_12(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
+    (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
+
+#define STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_13(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
+    (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
+
+#define STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_14(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
+    (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
+
+#define STORE_ROW_PARTIAL_16(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_15(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)     \
+    VSTORE_PARTIAL(N0, STORE_N0)                                                  \
+    (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
+/** @} */ // end of group STORE_ROW_PARTIAL_n
+
 /** Convert and store the 0th to (n-1)th rows of the given variables
  * @name CONVERT_STORE_ROW_n
  *
@@ -556,6 +1008,127 @@
 #define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 /** @} */ // end of group STORE_BLOCK
 
+/** Partially store a block of the given size STORE_M0xSTORE_N0
+ * @name STORE_BLOCK_PARTIAL
+ *
+ * @note The vector width @p N0 is also required for correct partial storing behaviour.
+ * @note in case @p STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
+ *
+ * The data to store is expected to have consecutive names for each row.
+ * E.g., for STORE_M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for STORE_M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * @param[in] STORE_M0  The number of rows to store. Supported: 1-16
+ * @param[in] STORE_N0  The lower number of elements of vectors to store. Supported: 1-16 and <= @p N0
+ * @param[in] N0        The size of each vector. Supported: 1, 2, 3, 4, 8, 16
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME  The basename of the variables
+ * @param[in] PTR       The base pointer
+ * @param[in] STRIDE_Y  The stride value in y-axis direction
+ * @param[in] Z         The offset in z-axis direction
+ * @{
+ */
+/* Two-level macro so that STORE_M0 is fully expanded before being token-pasted onto STORE_ROW_PARTIAL_. */
+#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+/** Store a block that can be partial in both x and y dimensions
+ *
+ * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
+ *
+ * The data to store is expected to have consecutive names for each row.
+ * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
+ * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
+ * @param[in] DATA_TYPE        The data type of the vectors
+ * @param[in] BASENAME         The basename of the variables
+ * @param[in] PTR              The base pointer
+ * @param[in] STRIDE_Y         The stride value in y-axis direction
+ * @param[in] Z                The offset in z-axis direction
+ * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
+ * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
+ * @param[in] N                Total number of columns. Not referenced by this macro's body; accepted for interface uniformity (the boundary condition is passed in via @p PARTIAL_COND_X).
+ * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
+ * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
+ */
+#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, PARTIAL_COND_Y, PARTIAL_COND_X) \
+    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                               \
+    {                                                                                                                                        \
+        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                              \
+    }                                                                                                                                        \
+    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                           \
+    {                                                                                                                                        \
+        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                \
+    }                                                                                                                                        \
+    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                           \
+    {                                                                                                                                        \
+        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                \
+    }                                                                                                                                        \
+    else                                                                                                                                     \
+    {                                                                                                                                        \
+        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                  \
+    }
+/** Store a block that can only be partial in x but not y.
+ *
+ * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
+ *
+ * The data to store is expected to have consecutive names for each row.
+ * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
+ * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
+ * @param[in] DATA_TYPE        The data type of the vectors
+ * @param[in] BASENAME         The basename of the variables
+ * @param[in] PTR              The base pointer
+ * @param[in] STRIDE_Y         The stride value in y-axis direction
+ * @param[in] Z                The offset in z-axis direction
+ * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
+ * @param[in] N                Total number of columns. Not referenced by this macro's body; accepted for interface uniformity (the boundary condition is passed in via @p PARTIAL_COND_X).
+ * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
+ */
+#define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, N, PARTIAL_COND_X) \
+    if(!(PARTIAL_COND_X))                                                                                            \
+    {                                                                                                        \
+        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                              \
+    }                                                                                                        \
+    else                                                                                                     \
+    {                                                                                                        \
+        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                \
+    }
+/** Store a block that can only be partial in y but not x.
+ *
+ * @note in case @p N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
+ *
+ * The data to store is expected to have consecutive names for each row.
+ * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
+ * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
+ * @param[in] DATA_TYPE        The data type of the vectors
+ * @param[in] BASENAME         The basename of the variables
+ * @param[in] PTR              The base pointer
+ * @param[in] STRIDE_Y         The stride value in y-axis direction
+ * @param[in] Z                The offset in z-axis direction
+ * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
+ * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
+ */
+#define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
+    if(!(PARTIAL_COND_Y))                                                                                         \
+    {                                                                                                     \
+        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                           \
+    }                                                                                                     \
+    else                                                                                                  \
+    {                                                                                                     \
+        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);             \
+    }
+/** @} */ // end of group STORE_BLOCK_PARTIAL
+
 /** Convert and store a block of the given size M0xN0
  * @name CONVERT_STORE_BLOCK
  *
@@ -1159,4 +1732,113 @@
  */
 #define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
 #define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
-/** @} */ // end of group CONVERT_BLOCK
\ No newline at end of file
+/** @} */ // end of group CONVERT_BLOCK
+
+#if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
+
+/** Boundary-aware GEMM block store
+ * @name STORE_BLOCK_BOUNDARY_AWARE
+ * This macro assumes the following schemes to achieve boundary-awareness:
+ *  - Overlapping load in Y axis from lhs tensor. This implies lhs has no padding along y dim.
+ *  - Non-Overlapping(normal) load from rhs tensor. This implies rhs can have paddings.
+ *  - Overlapping load in Y axis from bias tensor. This implies bias has no padding along y dim.
+ * The macro then ensures that the dst tensor can be stored without any paddings in both x and y dim.
+ *
+ * In the y dimension, we place the partial blocks **at the beginning** while in the x dimension, we place the partial
+ * blocks **at the end**.
+ * Say, the dst tensor is of shape MxN and we have M0 and N0 as the block size, this is how we define "partial blocks"/
+ * "boundary block" (we use the 2 terms "partial blocks" and "boundary blocks" interchangeably) and its various parameters:
+ *
+ *  *--x-->                         x == 0                        x == 1
+ *  |                  |<------------------------------N-------------------------->|
+ *  y                  |<--------------N0------------->|<----PARTIAL_STORE_N0----->|
+ *  |     -------------#############################################################
+ *  *     |          | |...............................|...........................|
+ * y == 0 | PAR_..._M0 |......Boundary block in y......|.Boundary block in x and y.|
+ *        |          | |...............................|...........................|
+ *        M          --#############################################################
+ *        |          | |                               |...........................|
+ * y == 1 |         M0 |      Non-boundary block       |....Boundary block in x....|
+ *        |          | |                               |...........................|
+ *        |------------#############################################################
+ *
+ * Then @p PARTIAL_STORE_M0 = M % M0      and @p PARTIAL_STORE_N0 = N % N0
+ *
+ * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring small performance penalty.
+ *
+ * It automatically detects if a given M,N,M0,N0 combination can yield partial blocks in either X or Y dimension,
+ * and selects the corresponding store methods such that the boundary detection logic is only added when needed.
+ *
+ * The data to store is expected to have consecutive names for each row.
+ * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
+ * @param[in] N0               The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
+ * @param[in] DATA_TYPE        The data type of the vectors
+ * @param[in] BASENAME         The basename of the variables
+ * @param[in] PTR              The base pointer
+ * @param[in] STRIDE_Y         The stride value in y-axis direction
+ * @param[in] Z                The offset in z-axis direction
+ * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
+ * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported: [0, @p N0)
+ * @param[in] N                Total number of columns. Used to detect if current block is at the boundary in x.
+ * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
+ * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
+ * @{
+ */
+#if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
+// Case1: No partial blocks in either x or y
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, PARTIAL_COND_Y, PARTIAL_COND_X) \
+    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+
+#elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
+// Case2: Partial blocks in y
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, PARTIAL_COND_Y, PARTIAL_COND_X) \
+    STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
+
+#elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
+// Case3: Partial blocks in x
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, PARTIAL_COND_Y, PARTIAL_COND_X) \
+    STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, N, PARTIAL_COND_X)
+
+#else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
+// Case4: Partial blocks in both x and y
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, PARTIAL_COND_Y, PARTIAL_COND_X) \
+    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, PARTIAL_COND_Y, PARTIAL_COND_X)
+
+#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
+
+#else // defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
+
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, PARTIAL_COND_Y, PARTIAL_COND_X) \
+    STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+
+#endif    // defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
+/** @} */ // end of group STORE_BLOCK_BOUNDARY_AWARE
+
+#if defined(PARTIAL_STORE_M0)
+/** Compute the start m0 row (LHS, BIAS and DST) in a boundary-aware way so as to avoid padding
+ * @name COMPUTE_M0_START_ROW
+ * If there are any partial blocks in y dimension, they are placed at the beginning of the rows.
+ * This shift amount is added to all rows such that the partial block (at the beginning) overlaps with the subsequent
+ * blocks in the y dimension to avoid any padding.
+ * EG: M0=4, PARTIAL_STORE_M0=1:
+ *                  | Non-overlapping | +M0_ROW_SHIFT (Overlapping)
+ * block 0 (partial)| start row = 0   | start row = 0
+ * block 1 (full)   | start row = 4   | start row = 1
+ * block 2 (full)   | start row = 8   | start row = 5
+ *
+ * @param[in] y                Global id of current block in y.
+ * @param[in] M0               The number of rows to store, for non-partial blocks. Supported: 1-16
+ * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
+ * @{
+ */
+#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
+    ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
+#else // defined(PARTIAL_STORE_M0)
+#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
+    ((uint)(y * M0))
+#endif    // defined(PARTIAL_STORE_M0)
+/** @} */ // end of group COMPUTE_M0_START_ROW
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
index dd09289..b4ac005 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/gemmlowp.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -835,11 +835,6 @@
     // Convert result of matrix multiplication to S32
     REPEAT_VAR_INIT_CONVERT_SAT(M0, VEC_DATA_TYPE(int, N0), c, c_int);
 
-    int batch_id = z;
-#if defined(DEPTH_GEMM3D)
-    batch_id /= (int)DEPTH_GEMM3D;
-#endif // defined(DEPTH_GEMM3D)
-
     // Offset contribution: c += (A_OFFSET * sum_col) + (B_OFFSET * sum_row) +  K_OFFSET;
     REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(int, N0), offset_s32_, K_OFFSET);
 
@@ -859,11 +854,11 @@
 
 #if defined(B_OFFSET)
     // Compute the offset contribution due to B_OFFSET
+    // Note: The sum_row tensor is generated through CLGEMMLowpMatrixAReductionKernel which
+    // does not introduce paddings. For this reason is safe to access the tensor in this manner
+    // without considering that the coordinate "y" could come from an input 3D tensor
     __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + (y * (uint)M0) * sizeof(int) + z * sum_row_stride_y;
 
-#if defined(HEIGHT_GEMM3D) && defined(DEPTH_GEMM3D)
-    sum_row_addr += (batch_id % (int)DEPTH_GEMM3D) * (int)HEIGHT_GEMM3D * sizeof(int);
-#endif // defined(HEIGHT_GEMM3D) && defined(DEPTH_GEMM3D)
     LOAD_SCALAR_AS_VECTOR(M0, N0, int, b_offset_s32_, sum_row_addr, 0, sum_row_stride_x);
 
     REPEAT_MLA_VAR_WITH_CONST_VEC(M0, offset_s32_, b_offset_s32_, (VEC_DATA_TYPE(int, N0))B_OFFSET);
@@ -1115,7 +1110,7 @@
  * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
  * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (e.g. -DSCALAR=3)
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED/QSYMM8
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -1180,7 +1175,7 @@
  * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
  * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (e.g. -DSCALAR=3)
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED/QSYMM8
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -1253,7 +1248,7 @@
  * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
  * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (i.e. -DSCALAR=3)
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -2241,4 +2236,4 @@
     // Store the result
     vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr);
 }
-#endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
\ No newline at end of file
+#endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
diff --git a/src/core/CL/cl_kernels/gemv.cl b/src/core/CL/cl_kernels/gemv.cl
index aabde41..aaa8397 100644
--- a/src/core/CL/cl_kernels/gemv.cl
+++ b/src/core/CL/cl_kernels/gemv.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/generate_proposals.cl b/src/core/CL/cl_kernels/generate_proposals.cl
index a947dad..e8306c5 100644
--- a/src/core/CL/cl_kernels/generate_proposals.cl
+++ b/src/core/CL/cl_kernels/generate_proposals.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/generate_proposals_quantized.cl b/src/core/CL/cl_kernels/generate_proposals_quantized.cl
index 690d1cf..0426419 100644
--- a/src/core/CL/cl_kernels/generate_proposals_quantized.cl
+++ b/src/core/CL/cl_kernels/generate_proposals_quantized.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/harris_corners.cl b/src/core/CL/cl_kernels/harris_corners.cl
index 5320a06..3e3c9fd 100644
--- a/src/core/CL/cl_kernels/harris_corners.cl
+++ b/src/core/CL/cl_kernels/harris_corners.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index c4cbf77..7b08233 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -194,6 +194,49 @@
 #define VLOAD_STR(size) vload##size
 #define VLOAD(size) VLOAD_STR(size)
 
+#define PIXEL_UNIT4 1
+#define PIXEL_UNIT8 2
+#define PIXEL_UNIT16 4
+
+/** Utility macro to convert a vector size in pixel unit.
+ *
+ * @name CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT
+ *
+ * @param[in] vec_size Vector size. Only 4, 8 and 16 are supported
+ *
+ * @return The pixel unit (number of pixels)
+ * @{
+ */
+#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
+#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
+/** @} */ // end of group CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT
+
+#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
+#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
+#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
+#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
+#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
+
+/** Utility macro to read a 2D OpenCL image object.
+ *
+ * @note Coordinates are not normalized
+ *
+ * @param[in] data_type Data type
+ * @param[in] n0        Number of pixels to read. Only 1, 2 and 4 are supported
+ * @param[in] img       OpenCL image object
+ * @param[in] x_coord   The x coordinate for the top-left pixel
+ * @param[in] y_coord   The y coordinate for the top-left pixel
+ *
+ * @return Pixels from the 2D OpenCL image object
+ * @{
+ */
+#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
+#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
+
 #define VSTORE_STR(size) vstore##size
 #define VSTORE(size) VSTORE_STR(size)
 
@@ -212,6 +255,142 @@
 #define vload1(OFFSET, PTR) *(OFFSET + PTR)
 #define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
 
+/** Extended partial vstore that correctly handles scalar values as well.
+ * Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops
+ * @name VSTORE_PARTIAL
+ *
+ * @note With this macro, the passed data can be both a vector and a scalar
+ * @note @p store_size needs to be <= @p size
+ * eg 1: Valid
+ * VSTORE_PARTIAL(16, 15) ...;
+ * eg 2: Invalid
+ * VSTORE_PARTIAL(4, 7) ...;
+ *
+ * @param[in] size       The width of @p DATA. Supported values: 1(scalar), 2, 3, 4, 8, 16
+ * @param[in] store_size The number of lower elements to store. Supported values: 1-16, but has to be <= @p size
+ * @{
+ */
+#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
+#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
+
+// Size == 1 (scalar)
+#define vstore_partial_1_1 vstore1
+// Size == 2
+#define vstore_partial_2_1 vstore_partial_1
+#define vstore_partial_2_2 vstore_partial_2
+// Size == 3
+#define vstore_partial_3_1 vstore_partial_1
+#define vstore_partial_3_2 vstore_partial_2
+#define vstore_partial_3_3 vstore_partial_3
+// Size == 4
+#define vstore_partial_4_1 vstore_partial_1
+#define vstore_partial_4_2 vstore_partial_2
+#define vstore_partial_4_3 vstore_partial_3
+#define vstore_partial_4_4 vstore_partial_4
+// Size == 8
+#define vstore_partial_8_1 vstore_partial_1
+#define vstore_partial_8_2 vstore_partial_2
+#define vstore_partial_8_3 vstore_partial_3
+#define vstore_partial_8_4 vstore_partial_4
+#define vstore_partial_8_5 vstore_partial_5
+#define vstore_partial_8_6 vstore_partial_6
+#define vstore_partial_8_7 vstore_partial_7
+#define vstore_partial_8_8 vstore_partial_8
+// Size == 16
+#define vstore_partial_16_1 vstore_partial_1
+#define vstore_partial_16_2 vstore_partial_2
+#define vstore_partial_16_3 vstore_partial_3
+#define vstore_partial_16_4 vstore_partial_4
+#define vstore_partial_16_5 vstore_partial_5
+#define vstore_partial_16_6 vstore_partial_6
+#define vstore_partial_16_7 vstore_partial_7
+#define vstore_partial_16_8 vstore_partial_8
+#define vstore_partial_16_9 vstore_partial_9
+#define vstore_partial_16_10 vstore_partial_10
+#define vstore_partial_16_11 vstore_partial_11
+#define vstore_partial_16_12 vstore_partial_12
+#define vstore_partial_16_13 vstore_partial_13
+#define vstore_partial_16_14 vstore_partial_14
+#define vstore_partial_16_15 vstore_partial_15
+#define vstore_partial_16_16 vstore_partial_16
+
+/** Partial vstore. Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops
+ * @name vstore_partial_n
+ *
+ * @note @p DATA needs to be a vector not a scalar
+ * @note n needs to be <= the vector width of the input variable @p DATA
+ * eg 1: Valid
+ * vstore_partial_15(var:float16, 0, 0xabcd);
+ * eg 2: Invalid
+ * vstore_partial_7(var:float4, 0, 0xabcd);
+ *
+ * @note in cases n == 1, 2, 3, 4, 8, 16, no extra vstore is invoked, thus there's no performance penalty.
+ *
+ * @param[in] DATA   The name of the variable
+ * @param[in] OFFSET Offset in n
+ * @param[in] PTR    The base pointer
+ * @{
+ */
+#define vstore_partial_1(DATA, OFFSET, PTR) \
+    vstore1(DATA.s0, OFFSET, PTR);
+
+#define vstore_partial_2(DATA, OFFSET, PTR) \
+    vstore2(DATA.s01, OFFSET, PTR);
+
+#define vstore_partial_3(DATA, OFFSET, PTR) \
+    vstore3(DATA.s012, OFFSET, PTR);
+
+#define vstore_partial_4(DATA, OFFSET, PTR) \
+    vstore4(DATA.s0123, OFFSET, PTR);
+
+#define vstore_partial_5(DATA, OFFSET, PTR)    \
+    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
+    vstore1(DATA.s4, OFFSET, PTR + 4);
+
+#define vstore_partial_6(DATA, OFFSET, PTR)    \
+    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
+    vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
+
+#define vstore_partial_7(DATA, OFFSET, PTR)    \
+    vstore_partial_4(DATA.s0123, OFFSET, PTR); \
+    vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
+
+#define vstore_partial_8(DATA, OFFSET, PTR) \
+    vstore8(DATA.s01234567, OFFSET, PTR);
+
+#define vstore_partial_9(DATA, OFFSET, PTR)        \
+    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
+    vstore1(DATA.s8, OFFSET, PTR + 8);
+
+#define vstore_partial_10(DATA, OFFSET, PTR)       \
+    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
+    vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
+
+#define vstore_partial_11(DATA, OFFSET, PTR)       \
+    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
+    vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);
+
+#define vstore_partial_12(DATA, OFFSET, PTR)       \
+    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
+    vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);
+
+#define vstore_partial_13(DATA, OFFSET, PTR)       \
+    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
+    vstore_partial_5(DATA.s89abc, OFFSET, PTR + 8);
+
+#define vstore_partial_14(DATA, OFFSET, PTR)       \
+    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
+    vstore_partial_6(DATA.s89abcd, OFFSET, PTR + 8);
+
+#define vstore_partial_15(DATA, OFFSET, PTR)       \
+    vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
+    vstore_partial_7(DATA.s89abcde, OFFSET, PTR + 8);
+
+#define vstore_partial_16(DATA, OFFSET, PTR) \
+    vstore16(DATA, OFFSET, PTR);
+/** @} */ // end of group vstore_partial_n
+/** @} */ // end of group VSTORE_PARTIAL
+
 // Convert built-in functions with _sat modifier are not supported in floating point so we create defines
 // without _sat to overcome this issue
 #define convert_float_sat convert_float
@@ -337,6 +516,10 @@
 #define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
     update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
 
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
+    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
+                           name##_stride_z, name##_step_z)
+
 /** Structure to hold Vector information */
 typedef struct Vector
 {
@@ -473,6 +656,32 @@
     return tensor;
 }
 
+/** Wrap 3D tensor information into a tensor structure.
+ *
+ * @param[in] ptr                           Pointer to the starting position of the buffer
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x                      Stride of the image in X dimension (in bytes)
+ * @param[in] step_x                        stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] stride_y                      Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y                        stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] stride_z                      Stride of the image in Z dimension (in bytes)
+ * @param[in] step_z                        stride_z * number of elements along Z processed per workitem(in bytes)
+ *
+ * @return A 3D tensor object
+ */
+inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+    Tensor3D tensor =
+    {
+        .ptr                           = ptr,
+        .offset_first_element_in_bytes = offset_first_element_in_bytes,
+        .stride_x                      = stride_x,
+        .stride_y                      = stride_y,
+        .stride_z                      = stride_z
+    };
+    return tensor;
+}
+
 inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
                                              uint step_w,
                                              uint mod_size)
@@ -537,4 +746,29 @@
     return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
 }
 
+/** Get the offset for a given linear index of a Tensor3D
+ *
+ * @param[in] tensor Pointer to the starting position of the buffer
+ * @param[in] width  Width of the input tensor
+ * @param[in] height Height of the input tensor
+ * @param[in] depth  Depth of the input tensor
+ * @param[in] index  Linear index
+ */
+inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
+{
+    uint num_elements = width * height;
+
+    const uint z = index / num_elements;
+
+    index %= num_elements;
+
+    const uint y = index / width;
+
+    index %= width;
+
+    const uint x = index;
+
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
+}
+
 #endif // _HELPER_H
diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h
index 37eb246..70134af 100644
--- a/src/core/CL/cl_kernels/helpers_asymm.h
+++ b/src/core/CL/cl_kernels/helpers_asymm.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/histogram.cl b/src/core/CL/cl_kernels/histogram.cl
index a652b28..a93cb4d 100644
--- a/src/core/CL/cl_kernels/histogram.cl
+++ b/src/core/CL/cl_kernels/histogram.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/hog.cl b/src/core/CL/cl_kernels/hog.cl
index 407ee2f..b14f361 100644
--- a/src/core/CL/cl_kernels/hog.cl
+++ b/src/core/CL/cl_kernels/hog.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/im2col.cl b/src/core/CL/cl_kernels/im2col.cl
index 3a84dab..a1467a0 100644
--- a/src/core/CL/cl_kernels/im2col.cl
+++ b/src/core/CL/cl_kernels/im2col.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -862,13 +862,88 @@
 }
 #endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
 
-#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(LAST_ACCESSED)
+#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE)
 
 #define VECTOR_N VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+#define COND_N VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE)
+
+/** Store a 1x9 row or a 3x3 block in a boundary-aware manner to avoid paddings in the channel dimension
+ *  @name IM2COL1X9_NHWC_STORE
+ *
+ *  @note To use this macro for a 3x3 block, @p ROW has to be 0
+ *
+ * @param[in] VECTOR_SIZE          The non-boundary vector width of @p DATA. Supported: 1(scalar), 2, 3, 4, 8, 16
+ * @param[in] BOUNDARY_VECTOR_SIZE The boundary vector width of @p DATA. Supported: 1-16, but has to be <= @p VECTOR_SIZE
+ * @param[in] DATA_TYPE            Data type of @p DATA
+ * @param[in] SRC_DEPTH            Input channel size / depth
+ * @param[in] DATA                 Value variable base name
+ * @param[in] ROW                  The row number to store. Supported: 0-8
+ * @param[in] OUTPUT_PTR           Output pointer
+ * @{
+ */
+#if defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
+#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR)         \
+    const bool at_channel_boundary = get_global_id(0) == 0;                                                          \
+    if(at_channel_boundary)                                                                                          \
+    {                                                                                                                \
+        IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+    }                                                                                                                \
+    else                                                                                                             \
+    {                                                                                                                \
+        IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR)                    \
+    }
+#else // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
+#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+    IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR)
+#endif // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
+
+#define IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+    VSTORE(VECTOR_SIZE)                                                                           \
+    (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH);                 \
+    VSTORE(VECTOR_SIZE)                                                                           \
+    (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH);                 \
+    VSTORE(VECTOR_SIZE)                                                                           \
+    (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH);                 \
+    VSTORE(VECTOR_SIZE)                                                                           \
+    (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH);                 \
+    VSTORE(VECTOR_SIZE)                                                                           \
+    (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH);                 \
+    VSTORE(VECTOR_SIZE)                                                                           \
+    (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH);                 \
+    VSTORE(VECTOR_SIZE)                                                                           \
+    (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH);                 \
+    VSTORE(VECTOR_SIZE)                                                                           \
+    (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH);                 \
+    VSTORE(VECTOR_SIZE)                                                                           \
+    (DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH);
+
+#define IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
+    (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH);                                    \
+    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
+    (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH);                                    \
+    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
+    (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH);                                    \
+    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
+    (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH);                                    \
+    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
+    (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH);                                    \
+    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
+    (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH);                                    \
+    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
+    (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH);                                    \
+    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
+    (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH);                                    \
+    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
+    (DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH);
+/** @}*/
 
 /** This kernel performs im2col when the kernel size is 3x3 and the data layout is NHWC
  *
  * @note This kernel computes VECTOR_SIZE elements
+ * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
+ * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
  * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
  * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
@@ -898,9 +973,11 @@
     uint src_stride_w,
     uint dst_stride_w)
 {
-    const int ch    = min((int)(get_global_id(0) * VECTOR_SIZE), LAST_ACCESSED); // input feature map
-    const int yo    = get_global_id(1);
-    const int batch = get_global_id(2); // batch size
+    // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
+    const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
+    const int ch           = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
+    const int yo           = get_global_id(1);
+    const int batch        = get_global_id(2); // batch size
 
     // Calculate input indices
     const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
@@ -915,10 +992,11 @@
 
     // Clamp xi
     int3 xi_offset = ((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT);
-#if PAD_TOP != 0 || PAD_BOTTOM != 0
+#if PAD_LEFT != 0 || PAD_RIGHT != 0
 #define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
     xi_offset = CLAMP(xi_offset, (int3)0, (int3)(SRC_WIDTH - 1));
-#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+#endif // PAD_LEFT != 0 || PAD_RIGHT != 0
+    // Multiply by src_stride_y as the width (X) dimension here is the second (y) dimension in src NHWC tensor
     xi_offset *= (int3)src_stride_y;
 
     // Out-of-bound condition for X
@@ -928,6 +1006,9 @@
     // Clamp yi
     // yi_coord is casted to unsigned int in order to use just a min() operation
     // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
+    // This is a trick so that the values loaded in the padding areas are always from the last row (SRC_HEIGHT - 1),
+    // because of the negative yi_coord wrap-around, but it gets overwritten by PAD_VALUE immediately as the wrap-around
+    // also causes y_cond (y padding condition) to be satisfied
     yi_coord = yi - (int)PAD_TOP;
 
     // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
@@ -946,9 +1027,9 @@
 #if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
     // Replace invalid values with PAD_VALUE
     int y_cond = (int)((uint)(yi - (int)PAD_TOP) >= (uint)(SRC_HEIGHT));
-    values0    = select(values0, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s0));
-    values1    = select(values1, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s1));
-    values2    = select(values2, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s2));
+    values0    = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
+    values1    = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
+    values2    = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
 #endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
 
     // yi == 1
@@ -971,9 +1052,9 @@
 #if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
     // Replace invalid values with zeros
     y_cond  = (int)((uint)(yi - (int)PAD_TOP + 1 * DILATION_Y) >= (uint)(SRC_HEIGHT));
-    values3 = select(values3, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s0));
-    values4 = select(values4, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s1));
-    values5 = select(values5, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s2));
+    values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
+    values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
+    values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
 #endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
 
     // yi == 2
@@ -996,32 +1077,20 @@
 #if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
     // Replace invalid values with PAD_VALUE
     y_cond  = (int)((uint)(yi - (int)PAD_TOP + 2 * DILATION_Y) >= (uint)(SRC_HEIGHT));
-    values6 = select(values6, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s0));
-    values7 = select(values7, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s1));
-    values8 = select(values8, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond.s2));
+    values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
+    values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
+    values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
 #endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
 
-    // Store
-    VSTORE(VECTOR_SIZE)
-    (values0, 0, (__global DATA_TYPE *)(output_ptr) + 0 * SRC_DEPTH);
-    VSTORE(VECTOR_SIZE)
-    (values1, 0, (__global DATA_TYPE *)(output_ptr) + 1 * SRC_DEPTH);
-    VSTORE(VECTOR_SIZE)
-    (values2, 0, (__global DATA_TYPE *)(output_ptr) + 2 * SRC_DEPTH);
-    VSTORE(VECTOR_SIZE)
-    (values3, 0, (__global DATA_TYPE *)(output_ptr) + 3 * SRC_DEPTH);
-    VSTORE(VECTOR_SIZE)
-    (values4, 0, (__global DATA_TYPE *)(output_ptr) + 4 * SRC_DEPTH);
-    VSTORE(VECTOR_SIZE)
-    (values5, 0, (__global DATA_TYPE *)(output_ptr) + 5 * SRC_DEPTH);
-    VSTORE(VECTOR_SIZE)
-    (values6, 0, (__global DATA_TYPE *)(output_ptr) + 6 * SRC_DEPTH);
-    VSTORE(VECTOR_SIZE)
-    (values7, 0, (__global DATA_TYPE *)(output_ptr) + 7 * SRC_DEPTH);
-    VSTORE(VECTOR_SIZE)
-    (values8, 0, (__global DATA_TYPE *)(output_ptr) + 8 * SRC_DEPTH);
+    // Store in a boundary-aware way to avoid padding
+    IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, 0, output_ptr)
 
 #ifdef HAS_BIAS
+    // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
+    // added at the end of the channel, while the boundary vec is at the beginning of the channel.
+    // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
+    // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
+    // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
     if((ch + VECTOR_SIZE) >= SRC_DEPTH)
     {
         *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 9) = 1.0f;
@@ -1030,97 +1099,66 @@
 }
 
 #if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
-#define IM2COL1x9(i)                                                                                                                                                       \
-    ({                                                                                                                                                                     \
-        yi_coord = yi - (int)PAD_TOP + i * DILATION_Y;                                                                                                                     \
-        yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));                                                                                                            \
+#define IM2COL1x9(i)                                                                                         \
+    ({                                                                                                       \
+        yi_coord = yi - (int)PAD_TOP + i * DILATION_Y;                                                       \
+        yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));                                              \
         \
-        offset0 = xi_offset0 + (yi_coord * (int)src_stride_z);                                                                                                             \
-        offset1 = xi_offset1 + (yi_coord * (int)src_stride_z);                                                                                                             \
+        offset0 = xi_offset0 + (yi_coord * (int)src_stride_z);                                               \
+        offset1 = xi_offset1 + (yi_coord * (int)src_stride_z);                                               \
         \
-        VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0));                                                                          \
-        VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1));                                                                          \
-        VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2));                                                                          \
-        VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3));                                                                          \
-        VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4));                                                                          \
-        VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5));                                                                          \
-        VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6));                                                                          \
-        VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7));                                                                          \
-        VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1));                                                                             \
+        VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0));            \
+        VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1));            \
+        VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2));            \
+        VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3));            \
+        VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4));            \
+        VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5));            \
+        VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6));            \
+        VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7));            \
+        VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1));               \
         \
-        int y_cond = (int)((uint)(yi - (int)PAD_TOP + i * DILATION_Y) >= (uint)(SRC_HEIGHT));                                                                              \
-        values0    = select(values0, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s0)); \
-        values1    = select(values1, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s1)); \
-        values2    = select(values2, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s2)); \
-        values3    = select(values3, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s3)); \
-        values4    = select(values4, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s4)); \
-        values5    = select(values5, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s5)); \
-        values6    = select(values6, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s6)); \
-        values7    = select(values7, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond0.s7)); \
-        values8    = select(values8, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))y_cond || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(x_cond1));    \
+        int y_cond = (int)((uint)(yi - (int)PAD_TOP + i * DILATION_Y) >= (uint)(SRC_HEIGHT));                \
+        values0    = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s0))); \
+        values1    = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s1))); \
+        values2    = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s2))); \
+        values3    = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s3))); \
+        values4    = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s4))); \
+        values5    = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s5))); \
+        values6    = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s6))); \
+        values7    = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s7))); \
+        values8    = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond1)));    \
         \
-        VSTORE(VECTOR_SIZE)                                                                                                                                                \
-        (values0, 0, (__global DATA_TYPE *)(output_ptr) + (0 + i * 9) * SRC_DEPTH);                                                                                        \
-        VSTORE(VECTOR_SIZE)                                                                                                                                                \
-        (values1, 0, (__global DATA_TYPE *)(output_ptr) + (1 + i * 9) * SRC_DEPTH);                                                                                        \
-        VSTORE(VECTOR_SIZE)                                                                                                                                                \
-        (values2, 0, (__global DATA_TYPE *)(output_ptr) + (2 + i * 9) * SRC_DEPTH);                                                                                        \
-        VSTORE(VECTOR_SIZE)                                                                                                                                                \
-        (values3, 0, (__global DATA_TYPE *)(output_ptr) + (3 + i * 9) * SRC_DEPTH);                                                                                        \
-        VSTORE(VECTOR_SIZE)                                                                                                                                                \
-        (values4, 0, (__global DATA_TYPE *)(output_ptr) + (4 + i * 9) * SRC_DEPTH);                                                                                        \
-        VSTORE(VECTOR_SIZE)                                                                                                                                                \
-        (values5, 0, (__global DATA_TYPE *)(output_ptr) + (5 + i * 9) * SRC_DEPTH);                                                                                        \
-        VSTORE(VECTOR_SIZE)                                                                                                                                                \
-        (values6, 0, (__global DATA_TYPE *)(output_ptr) + (6 + i * 9) * SRC_DEPTH);                                                                                        \
-        VSTORE(VECTOR_SIZE)                                                                                                                                                \
-        (values7, 0, (__global DATA_TYPE *)(output_ptr) + (7 + i * 9) * SRC_DEPTH);                                                                                        \
-        VSTORE(VECTOR_SIZE)                                                                                                                                                \
-        (values8, 0, (__global DATA_TYPE *)(output_ptr) + (8 + i * 9) * SRC_DEPTH);                                                                                        \
+        IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \
     })
 #else // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
-#define IM2COL1x9(i)                                                                              \
-    ({                                                                                            \
-        yi_coord = yi - (int)PAD_TOP + i * DILATION_Y;                                            \
-        yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));                                   \
+#define IM2COL1x9(i)                                                                                         \
+    ({                                                                                                       \
+        yi_coord = yi - (int)PAD_TOP + i * DILATION_Y;                                                       \
+        yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));                                              \
         \
-        offset0 = xi_offset0 + (yi_coord * (int)src_stride_z);                                    \
-        offset1 = xi_offset1 + (yi_coord * (int)src_stride_z);                                    \
+        offset0 = xi_offset0 + (yi_coord * (int)src_stride_z);                                               \
+        offset1 = xi_offset1 + (yi_coord * (int)src_stride_z);                                               \
         \
-        VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \
-        VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \
-        VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \
-        VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \
-        VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4)); \
-        VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \
-        VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \
-        VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \
-        VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1));    \
+        VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0));            \
+        VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1));            \
+        VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2));            \
+        VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3));            \
+        VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4));            \
+        VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5));            \
+        VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6));            \
+        VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7));            \
+        VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1));               \
         \
-        VSTORE(VECTOR_SIZE)                                                                       \
-        (values0, 0, (__global DATA_TYPE *)(output_ptr) + (0 + i * 9) * SRC_DEPTH);               \
-        VSTORE(VECTOR_SIZE)                                                                       \
-        (values1, 0, (__global DATA_TYPE *)(output_ptr) + (1 + i * 9) * SRC_DEPTH);               \
-        VSTORE(VECTOR_SIZE)                                                                       \
-        (values2, 0, (__global DATA_TYPE *)(output_ptr) + (2 + i * 9) * SRC_DEPTH);               \
-        VSTORE(VECTOR_SIZE)                                                                       \
-        (values3, 0, (__global DATA_TYPE *)(output_ptr) + (3 + i * 9) * SRC_DEPTH);               \
-        VSTORE(VECTOR_SIZE)                                                                       \
-        (values4, 0, (__global DATA_TYPE *)(output_ptr) + (4 + i * 9) * SRC_DEPTH);               \
-        VSTORE(VECTOR_SIZE)                                                                       \
-        (values5, 0, (__global DATA_TYPE *)(output_ptr) + (5 + i * 9) * SRC_DEPTH);               \
-        VSTORE(VECTOR_SIZE)                                                                       \
-        (values6, 0, (__global DATA_TYPE *)(output_ptr) + (6 + i * 9) * SRC_DEPTH);               \
-        VSTORE(VECTOR_SIZE)                                                                       \
-        (values7, 0, (__global DATA_TYPE *)(output_ptr) + (7 + i * 9) * SRC_DEPTH);               \
-        VSTORE(VECTOR_SIZE)                                                                       \
-        (values8, 0, (__global DATA_TYPE *)(output_ptr) + (8 + i * 9) * SRC_DEPTH);               \
+        IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \
     })
 #endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
 
 /** This kernel performs im2col when the kernel size is 9x9 and the data layout is NHWC
  *
  * @note This kernel computes VECTOR_SIZE elements
+ * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
+ * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
  * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
  * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
@@ -1150,9 +1188,11 @@
     uint src_stride_w,
     uint dst_stride_w)
 {
-    const int ch    = min((int)(get_global_id(0) * VECTOR_SIZE), LAST_ACCESSED); // input feature map
-    const int yo    = get_global_id(1);
-    const int batch = get_global_id(2); // batch size
+    // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
+    const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
+    const int ch           = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
+    const int yo           = get_global_id(1);
+    const int batch        = get_global_id(2); // batch size
 
     // Calculate input indices
     const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
@@ -1170,11 +1210,11 @@
     int8 xi_offset0 = ((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT);
     int  xi_offset1 = ((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT);
 
-#if PAD_TOP != 0 || PAD_BOTTOM != 0
+#if PAD_LEFT != 0 || PAD_RIGHT != 0
 #define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
     xi_offset0 = CLAMP(xi_offset0, (int8)0, (int8)(SRC_WIDTH - 1));
     xi_offset1 = CLAMP(xi_offset1, (int)0, (int)(SRC_WIDTH - 1));
-#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+#endif // PAD_LEFT != 0 || PAD_RIGHT != 0
     xi_offset0 *= (int8)src_stride_y;
     xi_offset1 *= (int)src_stride_y;
 
@@ -1193,6 +1233,11 @@
     IM2COL1x9(8);
 
 #ifdef HAS_BIAS
+    // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
+    // added at the end of the channel, while the boundary vec is at the beginning of the channel.
+    // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
+    // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
+    // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
     if((ch + VECTOR_SIZE) >= SRC_DEPTH)
     {
         *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 81) = 1.0f;
@@ -1202,6 +1247,10 @@
 
 /** This opencl kernel performs a generic im2col implementation when the data layout is NHWC
  *
+ * @note This kernel computes VECTOR_SIZE elements
+ * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
+ * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
  * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
  * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
@@ -1235,9 +1284,11 @@
     uint src_stride_w,
     uint dst_stride_w)
 {
-    const int ch    = min((int)(get_global_id(0) * VECTOR_SIZE), LAST_ACCESSED); // input feature map
-    const int yo    = get_global_id(1);
-    const int batch = get_global_id(2); // batch size
+    // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
+    const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
+    const int ch           = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
+    const int yo           = get_global_id(1);
+    const int batch        = get_global_id(2); // batch size
 
     // Calculate input indices
     const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
@@ -1270,23 +1321,40 @@
 
             VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset));
 
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
             // Replace with PAD_VALUE if the value is out-of-bound
-            values0 = select(values0, (VECTOR_N)PAD_VALUE, (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))x_border_condition || (VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE))(y_border_condition));
+            values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)x_border_condition || (COND_N)(y_border_condition)));
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
 
-            // Store
-            VSTORE(VECTOR_SIZE)
-            (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);
-
+            // Store in a boundary-aware way to avoid padding
+#if BOUNDARY_VECTOR_SIZE != VECTOR_SIZE
+            const bool at_channel_boundary = get_global_id(0) == 0;
+            if(at_channel_boundary)
+            {
+                VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)
+                (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);
+            }
+            else // !at_channel_boundary: interior vector, store the full VECTOR_SIZE
+#endif           // BOUNDARY_VECTOR_SIZE != VECTOR_SIZE
+            {
+                VSTORE(VECTOR_SIZE)
+                (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);
+            }
             i++;
         }
     }
 
 #ifdef HAS_BIAS
+    // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
+    // added at the end of the channel, while the boundary vec is at the beginning of the channel.
+    // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
+    // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
+    // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
     if((ch + VECTOR_SIZE) >= SRC_DEPTH)
     {
         *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT) = 1.0f;
     }
 #endif // HAS_BIAS
 }
-#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(LAST_ACCESSED)
+#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE)
 #endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE)
diff --git a/src/core/CL/cl_kernels/instance_normalization.cl b/src/core/CL/cl_kernels/instance_normalization.cl
index 043012b..480d9cd 100644
--- a/src/core/CL/cl_kernels/instance_normalization.cl
+++ b/src/core/CL/cl_kernels/instance_normalization.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/integral_image.cl b/src/core/CL/cl_kernels/integral_image.cl
index 970e04e..dd2c798 100644
--- a/src/core/CL/cl_kernels/integral_image.cl
+++ b/src/core/CL/cl_kernels/integral_image.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/l2_normalize.cl b/src/core/CL/cl_kernels/l2_normalize.cl
index 70b8b36..14b37e3 100644
--- a/src/core/CL/cl_kernels/l2_normalize.cl
+++ b/src/core/CL/cl_kernels/l2_normalize.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/magnitude_phase.cl b/src/core/CL/cl_kernels/magnitude_phase.cl
index e9b5e97..48197d6 100644
--- a/src/core/CL/cl_kernels/magnitude_phase.cl
+++ b/src/core/CL/cl_kernels/magnitude_phase.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/mean_stddev.cl b/src/core/CL/cl_kernels/mean_stddev.cl
index 74d6b0b..4ddf931 100644
--- a/src/core/CL/cl_kernels/mean_stddev.cl
+++ b/src/core/CL/cl_kernels/mean_stddev.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/mean_stddev_normalization.cl b/src/core/CL/cl_kernels/mean_stddev_normalization.cl
index 9667737..4141d3e 100644
--- a/src/core/CL/cl_kernels/mean_stddev_normalization.cl
+++ b/src/core/CL/cl_kernels/mean_stddev_normalization.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/memset.cl b/src/core/CL/cl_kernels/memset.cl
index 7d8e0ef..bb46a49 100644
--- a/src/core/CL/cl_kernels/memset.cl
+++ b/src/core/CL/cl_kernels/memset.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
  * -# -DVEC_SIZE = Vector size
  * -# -DLAST_ACCESSED_X = The element that is on the X border (threads trying to set this, might need to step back a bit)
  *
- * @param[in] tensor_ptr                           Pointer to the source image. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] tensor_ptr                           Pointer to the source image. Data types supported: All.
  * @param[in] tensor_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in] tensor_step_x                        tensor_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] tensor_stride_y                      Stride of the source image in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/minmax_layer.cl b/src/core/CL/cl_kernels/minmax_layer.cl
index 1e543b4..655696f 100644
--- a/src/core/CL/cl_kernels/minmax_layer.cl
+++ b/src/core/CL/cl_kernels/minmax_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/minmaxloc.cl b/src/core/CL/cl_kernels/minmaxloc.cl
index 0f557a4..1045f22 100644
--- a/src/core/CL/cl_kernels/minmaxloc.cl
+++ b/src/core/CL/cl_kernels/minmaxloc.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/non_linear_filter3x3.cl b/src/core/CL/cl_kernels/non_linear_filter3x3.cl
index 19118ea..93c5024 100644
--- a/src/core/CL/cl_kernels/non_linear_filter3x3.cl
+++ b/src/core/CL/cl_kernels/non_linear_filter3x3.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/non_linear_filter5x5.cl b/src/core/CL/cl_kernels/non_linear_filter5x5.cl
index d3b2958..7c87284 100644
--- a/src/core/CL/cl_kernels/non_linear_filter5x5.cl
+++ b/src/core/CL/cl_kernels/non_linear_filter5x5.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/non_linear_filter_helpers.h b/src/core/CL/cl_kernels/non_linear_filter_helpers.h
index 77da209..3fcfad4 100644
--- a/src/core/CL/cl_kernels/non_linear_filter_helpers.h
+++ b/src/core/CL/cl_kernels/non_linear_filter_helpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/nonmax.cl b/src/core/CL/cl_kernels/nonmax.cl
index 0e388d7..ab13131 100644
--- a/src/core/CL/cl_kernels/nonmax.cl
+++ b/src/core/CL/cl_kernels/nonmax.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,13 +25,13 @@
 
 /** This function performs Non maxima suppression over a 3x3 window on a given image.
  *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: F32
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8/F32
  * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
  * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: F32
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
  * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/normalization_layer.cl b/src/core/CL/cl_kernels/normalization_layer.cl
index 390f8fc..ff4dc8e 100644
--- a/src/core/CL/cl_kernels/normalization_layer.cl
+++ b/src/core/CL/cl_kernels/normalization_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/normalize_planar_yuv_layer.cl b/src/core/CL/cl_kernels/normalize_planar_yuv_layer.cl
index a105968..f803f52 100644
--- a/src/core/CL/cl_kernels/normalize_planar_yuv_layer.cl
+++ b/src/core/CL/cl_kernels/normalize_planar_yuv_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl b/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl
index b2ba65f..27017a0 100644
--- a/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl b/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl
index 8a126a0..9bbde1a 100644
--- a/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl
+++ b/src/core/CL/cl_kernels/optical_flow_pyramid_lk.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/pad_layer.cl b/src/core/CL/cl_kernels/pad_layer.cl
index 88c401d..4e4d2ad 100644
--- a/src/core/CL/cl_kernels/pad_layer.cl
+++ b/src/core/CL/cl_kernels/pad_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -51,7 +51,7 @@
  *       -# -DPAD_W_BEFORE: Pad to add before the first batch of the input tensor (e.g. -DPAD_W_BEFORE=3)
  *       -# -DSRC_BATCH: Input tensor's batch size (e.g. -DSRC_BATCH=4)
  *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8, S8, QASYMM8, QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
@@ -165,7 +165,7 @@
  * @note If the starting point to read backward from is less than the output's last element accessed in the X, the following compile flags must be passed at compile time to avoid negative offsets:
  *       -# -DAFTER_PAD_REM: Defines how much to rotate the vector if the backward calculation attempted to read from a negative offset (e.g. -DAFTER_PAD_REM=3)
  *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8, S8, QASYMM8, QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/permute.cl b/src/core/CL/cl_kernels/permute.cl
index 82df141..db9e7ec 100644
--- a/src/core/CL/cl_kernels/permute.cl
+++ b/src/core/CL/cl_kernels/permute.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_float.cl b/src/core/CL/cl_kernels/pixelwise_mul_float.cl
index 163cb23..d623226 100644
--- a/src/core/CL/cl_kernels/pixelwise_mul_float.cl
+++ b/src/core/CL/cl_kernels/pixelwise_mul_float.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_int.cl b/src/core/CL/cl_kernels/pixelwise_mul_int.cl
index 097df82..b0bd338 100644
--- a/src/core/CL/cl_kernels/pixelwise_mul_int.cl
+++ b/src/core/CL/cl_kernels/pixelwise_mul_int.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
index 207669e..9e6521b 100644
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ b/src/core/CL/cl_kernels/pooling_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -192,22 +192,22 @@
  *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
  *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
 __kernel void pooling_layer_2(
     TENSOR3D_DECLARATION(input),
@@ -256,22 +256,22 @@
  *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
  *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
 __kernel void pooling_layer_3(
     TENSOR3D_DECLARATION(input),
@@ -344,22 +344,22 @@
  *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
  *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
 __kernel void pooling_layer_optimized_3(
     TENSOR3D_DECLARATION(input),
@@ -402,22 +402,22 @@
  *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
  * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
  * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
 __kernel void pooling_layer_MxN_nchw(
     TENSOR3D_DECLARATION(input),
@@ -515,17 +515,17 @@
  *       -DPOOL_AVG must be provided otherwise max pooling will be performed.
  * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
  *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
  * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
  * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
  * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
@@ -534,7 +534,7 @@
  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
  * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
 __kernel void pooling_layer_MxN_nhwc(
     TENSOR4D_DECLARATION(input),
@@ -572,7 +572,7 @@
             data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y, 0));
 #else  /* defined(DST_DEPTH) */
             VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
-            data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y));
+            data0    = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y));
 #endif /* defined(DST_DEPTH) */
 
 #if defined(POOL_L2)
@@ -596,3 +596,443 @@
     // Store result
     vstore8(CONVERT(vdata, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)output.ptr);
 }
+
+#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+
+inline void offset_no_padding_nchw(const Tensor3D *input, uint *offset_top, uint *offset_bottom)
+{
+    const int pad_horiz = PAD_TENSOR_LEFT + PAD_TENSOR_RIGHT;
+    const int pad_vert  = PAD_TENSOR_TOP + PAD_TENSOR_BOTTOM;
+
+    const int x = get_global_id(0) * STRIDE_X;
+    const int y = get_global_id(1) * STRIDE_Y;
+    const int z = get_global_id(2);
+
+    //x axis: width, y axis: height, z axis: component
+    const uint padded_offset = input->offset_first_element_in_bytes
+                               + x * input->stride_x
+                               + y * input->stride_y
+                               + z * input->stride_z;
+
+    const uint offset_base = padded_offset
+                             - y * pad_horiz * sizeof(DATA_TYPE)                                               /* Horizontal padding for each row */
+                             - PAD_TENSOR_TOP * input->stride_y                                                /* top padding */
+                             - z * MAX_HEIGHT * pad_horiz * sizeof(DATA_TYPE) - z * pad_vert * input->stride_y /* Z plane padding */
+                             - PAD_TENSOR_LEFT * sizeof(DATA_TYPE);
+
+#if defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT)
+    *offset_top = (uint)((offset_base / sizeof(DATA_TYPE)) % (TENSOR_CHANNEL * TENSOR_WIDTH * TENSOR_HEIGHT));
+#else  /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */
+    *offset_top = (uint)(offset_base / sizeof(DATA_TYPE));
+#endif /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */
+
+    *offset_bottom = *offset_top + input->stride_y / sizeof(DATA_TYPE) - pad_horiz;
+
+    return;
+}
+
+inline void offset_no_padding_nhwc_3D(const Tensor3D *input, uint *offset_x0, uint *offset_x1, uint *offset_x2, uint *offset_x3)
+{
+    const int pad_horiz = PAD_TENSOR_LEFT + PAD_TENSOR_RIGHT;
+
+    const int x = get_global_id(0);
+    const int y = get_global_id(1) * STRIDE_X;
+    const int z = get_global_id(2) * STRIDE_Y;
+
+    //x axis: component, y axis: width, z axis: height
+    const uint padded_offset = input->offset_first_element_in_bytes
+                               + x * 8 * input->stride_x
+                               + y * input->stride_y
+                               + z * input->stride_z;
+
+    const uint offset_base = padded_offset
+                             - (z + 1) * PAD_TENSOR_TOP * input->stride_y    /* Top padding for each z plane */
+                             - y * pad_horiz * sizeof(DATA_TYPE)             /* Horizontal padding for each row */
+                             - z * MAX_WIDTH * pad_horiz * sizeof(DATA_TYPE) /* Horizontal padding for each z plane */
+                             - PAD_TENSOR_LEFT * sizeof(DATA_TYPE);
+
+    *offset_x0 = (uint)offset_base / sizeof(DATA_TYPE);
+    *offset_x1 = *offset_x0 + input->stride_y / sizeof(DATA_TYPE) - pad_horiz;
+    *offset_x2 = *offset_x0 + input->stride_z / sizeof(DATA_TYPE) - pad_horiz * MAX_WIDTH - PAD_TENSOR_TOP * input->stride_y / sizeof(DATA_TYPE);
+    *offset_x3 = *offset_x2 + input->stride_y / sizeof(DATA_TYPE) - pad_horiz;
+
+    return;
+}
+
+#if defined(DST_DEPTH)
+inline void offset_no_padding_nhwc_4D(const Tensor4D *input, uint *offset_x0, uint *offset_x1, uint *offset_x2, uint *offset_x3)
+{
+    const int pad_horiz = PAD_TENSOR_LEFT + PAD_TENSOR_RIGHT;
+    const int z_max     = get_global_size(2) / BATCH_SIZE;
+
+    const int x = get_global_id(0);
+    const int y = get_global_id(1) * STRIDE_X;
+    const int z = (get_global_id(2) % z_max) * STRIDE_Y;
+    const int w = get_global_id(2) / z_max;
+
+    const unsigned int padded_offset = input->offset_first_element_in_bytes
+                                       + x * 8 * input->stride_x
+                                       + y * input->stride_y
+                                       + z * input->stride_z;
+
+    const unsigned int offset_base = padded_offset
+                                     - (z + 1) * PAD_TENSOR_TOP * input->stride_y    /* Top padding for each z plane */
+                                     - y * pad_horiz * sizeof(DATA_TYPE)             /* Horizontal padding for each row */
+                                     - z * MAX_WIDTH * pad_horiz * sizeof(DATA_TYPE) /* Horizontal padding for each z plane */
+                                     - PAD_TENSOR_LEFT * sizeof(DATA_TYPE);
+
+    *offset_x0 = (uint)offset_base / sizeof(DATA_TYPE);
+    *offset_x1 = *offset_x0 + input->stride_y / sizeof(DATA_TYPE) - pad_horiz;
+    *offset_x2 = *offset_x0 + input->stride_z / sizeof(DATA_TYPE) - pad_horiz * MAX_WIDTH - PAD_TENSOR_TOP * input->stride_y / sizeof(DATA_TYPE);
+    *offset_x3 = *offset_x2 + input->stride_y / sizeof(DATA_TYPE) - pad_horiz;
+
+    return;
+}
+#endif //defined(DST_DEPTH)
+
+#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+
+/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NCHW.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM
+ *
+ * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: F32
+ * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  indices_ptr                           Pointer to the indices tensor. Supported data types: U32
+ * @param[in]  indices_stride_x                      Stride of the indices tensor in X dimension (in bytes)
+ * @param[in]  indices_step_x                        indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  indices_stride_y                      Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in]  indices_step_y                        indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  indices_stride_z                      Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in]  indices_step_z                        indices_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
+ */
+__kernel void pooling_layer_2_nchw_indices_fp32(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output),
+    TENSOR3D_DECLARATION(indices))
+{
+    // Get pixels pointer
+    Tensor3D input   = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output  = CONVERT_TO_TENSOR3D_STRUCT(output);
+    Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices);
+
+    // Load data
+    float2 data0 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 0, 0));
+    float2 data1 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
+
+    // Perform calculations
+    float data0_max = POOL_OP(data0.s0, data0.s1);
+    float data1_max = POOL_OP(data1.s0, data1.s1);
+    float res       = POOL_OP(data0_max, data1_max);
+    // Store result
+    *(__global float *)output.ptr = res;
+
+#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+
+    uint offset_top    = 0;
+    uint offset_bottom = 0;
+
+    offset_no_padding_nchw(&input, &offset_top, &offset_bottom);
+
+    uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1));
+    uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1));
+    uint index  = select(index1, index0, isgreaterequal(data0_max, data1_max));
+
+    *(__global uint *)indices.ptr = index;
+
+#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+}
+
+/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NCHW.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F16
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM
+ *
+ * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: F16
+ * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  indices_ptr                           Pointer to the indices tensor. Supported data types: U32
+ * @param[in]  indices_stride_x                      Stride of the indices tensor in X dimension (in bytes)
+ * @param[in]  indices_step_x                        indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  indices_stride_y                      Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in]  indices_step_y                        indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  indices_stride_z                      Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in]  indices_step_z                        indices_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
+ */
+__kernel void pooling_layer_2_nchw_indices_fp16(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output),
+    TENSOR3D_DECLARATION(indices))
+{
+    // Get pixels pointer
+    Tensor3D input   = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output  = CONVERT_TO_TENSOR3D_STRUCT(output);
+    Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices);
+
+    // Load data
+    half2 data0 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 0, 0));
+    half2 data1 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 1, 0));
+
+    // Perform calculations
+    half data0_max = POOL_OP(data0.s0, data0.s1);
+    half data1_max = POOL_OP(data1.s0, data1.s1);
+    half res       = POOL_OP(data0_max, data1_max);
+    // Store result
+    *(__global half *)output.ptr = res;
+
+#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+
+    uint offset_top    = 0;
+    uint offset_bottom = 0;
+
+    offset_no_padding_nchw(&input, &offset_top, &offset_bottom);
+
+    uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1));
+    uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1));
+    uint index  = select(index1, index0, isgreaterequal(data0_max, data1_max));
+
+    *(__global uint *)indices.ptr = index;
+
+#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+}
+
+/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NHWC.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM
+ *
+ * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: F32
+ * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_stride_w                        Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                          input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                       Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                         output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  indices_ptr                           Pointer to the indices tensor. Supported data types: U32
+ * @param[in]  indices_stride_x                      Stride of the indices tensor in X dimension (in bytes)
+ * @param[in]  indices_step_x                        indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  indices_stride_y                      Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in]  indices_step_y                        indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  indices_stride_z                      Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in]  indices_step_z                        indices_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  indices_stride_w                      Stride of the indices tensor in W dimension (in bytes)
+ * @param[in]  indices_step_w                        indices_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
+ */
+__kernel void pooling_layer_2_nhwc_indices_fp32(
+    TENSOR4D_DECLARATION(input),
+    TENSOR4D_DECLARATION(output),
+    TENSOR4D_DECLARATION(indices))
+{
+    // Get pixels pointer
+#if defined(DST_DEPTH)
+    Tensor4D input   = CONVERT_TO_TENSOR4D_STRUCT(input, DST_DEPTH);
+    Tensor4D output  = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
+    Tensor4D indices = CONVERT_TO_TENSOR4D_STRUCT(indices, DST_DEPTH);
+#else  /* defined(DST_DEPTH) */
+    Tensor3D input   = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output  = CONVERT_TO_TENSOR3D_STRUCT(output);
+    Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices);
+#endif /* defined(DST_DEPTH) */
+
+#if defined(DST_DEPTH)
+    // Load data
+    float8 data_top0    = VLOAD(8)(0, (__global float *)tensor4D_offset(&input, 0, 0, 0, 0));
+    float8 data_top1    = VLOAD(8)(0, (__global float *)tensor4D_offset(&input, 0, 1, 0, 0));
+    float8 data_bottom0 = VLOAD(8)(0, (__global float *)tensor4D_offset(&input, 0, 0, 1, 0));
+    float8 data_bottom1 = VLOAD(8)(0, (__global float *)tensor4D_offset(&input, 0, 1, 1, 0));
+#else  /* defined(DST_DEPTH) */
+    // Load data
+    float8   data_top0    = VLOAD(8)(0, (__global float *)tensor3D_offset(&input, 0, 0, 0));
+    float8   data_top1    = VLOAD(8)(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
+    float8   data_bottom0 = VLOAD(8)(0, (__global float *)tensor3D_offset(&input, 0, 0, 1));
+    float8   data_bottom1 = VLOAD(8)(0, (__global float *)tensor3D_offset(&input, 0, 1, 1));
+#endif /* defined(DST_DEPTH) */
+
+    float8 data_top_max    = POOL_OP(data_top0, data_top1);
+    float8 data_bottom_max = POOL_OP(data_bottom0, data_bottom1);
+    float8 data_max        = POOL_OP(data_top_max, data_bottom_max);
+    vstore8(data_max, 0, (__global float *)output.ptr);
+
+#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+
+    uint offset_x0 = 0;
+    uint offset_x1 = 0;
+    uint offset_x2 = 0;
+    uint offset_x3 = 0;
+
+#if defined(DST_DEPTH)
+    offset_no_padding_nhwc_4D(&input, &offset_x0, &offset_x1, &offset_x2, &offset_x3);
+#else  /* defined(DST_DEPTH) */
+    offset_no_padding_nhwc_3D(&input, &offset_x0, &offset_x1, &offset_x2, &offset_x3);
+#endif /* defined(DST_DEPTH) */
+
+    uint8 voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3, offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7 };
+    uint8 voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3, offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7 };
+    uint8 voffset_x2 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3, offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7 };
+    uint8 voffset_x3 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3, offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7 };
+
+    uint8 index0 = select(voffset_x1, voffset_x0, isgreaterequal(data_top0, data_top1));
+    uint8 index1 = select(voffset_x3, voffset_x2, isgreaterequal(data_bottom0, data_bottom1));
+    uint8 index  = select(index1, index0, isgreaterequal(data_top_max, data_bottom_max));
+    vstore8(index, 0, (__global uint *)indices.ptr);
+
+#endif /* defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) */
+}
+
+/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NHWC.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F16
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM
+ *
+ * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: F16
+ * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_stride_w                        Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                          input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                       Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                         output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  indices_ptr                           Pointer to the indices tensor. Supported data types: U32
+ * @param[in]  indices_stride_x                      Stride of the indices tensor in X dimension (in bytes)
+ * @param[in]  indices_step_x                        indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  indices_stride_y                      Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in]  indices_step_y                        indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  indices_stride_z                      Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in]  indices_step_z                        indices_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  indices_stride_w                      Stride of the indices tensor in W dimension (in bytes)
+ * @param[in]  indices_step_w                        indices_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
+ */
+__kernel void pooling_layer_2_nhwc_indices_fp16(
+    TENSOR4D_DECLARATION(input),
+    TENSOR4D_DECLARATION(output),
+    TENSOR4D_DECLARATION(indices))
+{
+    // Get pixels pointer
+#if defined(DST_DEPTH)
+    Tensor4D input   = CONVERT_TO_TENSOR4D_STRUCT(input, DST_DEPTH);
+    Tensor4D output  = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
+    Tensor4D indices = CONVERT_TO_TENSOR4D_STRUCT(indices, DST_DEPTH);
+#else  /* defined(DST_DEPTH) */
+    Tensor3D input        = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output       = CONVERT_TO_TENSOR3D_STRUCT(output);
+    Tensor3D indices      = CONVERT_TO_TENSOR3D_STRUCT(indices);
+#endif /* defined(DST_DEPTH) */
+
+#if defined(DST_DEPTH)
+    // Load data
+    half8 data_top0    = VLOAD(8)(0, (__global half *)tensor4D_offset(&input, 0, 0, 0, 0));
+    half8 data_top1    = VLOAD(8)(0, (__global half *)tensor4D_offset(&input, 0, 1, 0, 0));
+    half8 data_bottom0 = VLOAD(8)(0, (__global half *)tensor4D_offset(&input, 0, 0, 1, 0));
+    half8 data_bottom1 = VLOAD(8)(0, (__global half *)tensor4D_offset(&input, 0, 1, 1, 0));
+#else  /* defined(DST_DEPTH) */
+    // Load data
+    half8 data_top0    = VLOAD(8)(0, (__global half *)tensor3D_offset(&input, 0, 0, 0));
+    half8 data_top1    = VLOAD(8)(0, (__global half *)tensor3D_offset(&input, 0, 1, 0));
+    half8 data_bottom0 = VLOAD(8)(0, (__global half *)tensor3D_offset(&input, 0, 0, 1));
+    half8 data_bottom1 = VLOAD(8)(0, (__global half *)tensor3D_offset(&input, 0, 1, 1));
+#endif /* defined(DST_DEPTH) */
+
+    half8 data_top_max    = POOL_OP(data_top0, data_top1);
+    half8 data_bottom_max = POOL_OP(data_bottom0, data_bottom1);
+    half8 data_max        = POOL_OP(data_top_max, data_bottom_max);
+    vstore8(data_max, 0, (__global half *)output.ptr);
+
+#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+
+    uint offset_x0_int = 0;
+    uint offset_x1_int = 0;
+    uint offset_x2_int = 0;
+    uint offset_x3_int = 0;
+
+#if defined(DST_DEPTH)
+    offset_no_padding_nhwc_4D(&input, &offset_x0_int, &offset_x1_int, &offset_x2_int, &offset_x3_int);
+#else  /* defined(DST_DEPTH) */
+    offset_no_padding_nhwc_3D(&input, &offset_x0_int, &offset_x1_int, &offset_x2_int, &offset_x3_int);
+#endif /* defined(DST_DEPTH) */
+
+    ushort offset_x0 = (ushort)offset_x0_int;
+    ushort offset_x1 = (ushort)offset_x1_int;
+    ushort offset_x2 = (ushort)offset_x2_int;
+    ushort offset_x3 = (ushort)offset_x3_int;
+
+    ushort8 voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3, offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7 };
+    ushort8 voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3, offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7 };
+    ushort8 voffset_x2 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3, offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7 };
+    ushort8 voffset_x3 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3, offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7 };
+
+    ushort8 index0 = select(voffset_x1, voffset_x0, isgreaterequal(data_top0, data_top1));
+    ushort8 index1 = select(voffset_x3, voffset_x2, isgreaterequal(data_bottom0, data_bottom1));
+    ushort8 index  = select(index1, index0, isgreaterequal(data_top_max, data_bottom_max));
+    vstore8(CONVERT(index, uint8), 0, (__global uint *)indices.ptr);
+
+#endif /* defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) */
+}
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
index 3a370ee..fe13464 100644
--- a/src/core/CL/cl_kernels/pooling_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/prior_box_layer.cl b/src/core/CL/cl_kernels/prior_box_layer.cl
index 046151b..de10dec 100644
--- a/src/core/CL/cl_kernels/prior_box_layer.cl
+++ b/src/core/CL/cl_kernels/prior_box_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/qlstm_layer_normalization.cl b/src/core/CL/cl_kernels/qlstm_layer_normalization.cl
index 08f0b53..24cb111 100644
--- a/src/core/CL/cl_kernels/qlstm_layer_normalization.cl
+++ b/src/core/CL/cl_kernels/qlstm_layer_normalization.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/quantization_layer.cl b/src/core/CL/cl_kernels/quantization_layer.cl
index cfb2bb6..3538dae 100644
--- a/src/core/CL/cl_kernels/quantization_layer.cl
+++ b/src/core/CL/cl_kernels/quantization_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/range.cl b/src/core/CL/cl_kernels/range.cl
index d122c9a..1e5c77b 100644
--- a/src/core/CL/cl_kernels/range.cl
+++ b/src/core/CL/cl_kernels/range.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl
index b9d33bd..b2e5692 100644
--- a/src/core/CL/cl_kernels/reduction_operation.cl
+++ b/src/core/CL/cl_kernels/reduction_operation.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -167,11 +167,11 @@
  * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
  * @note In case of MIN and MAX the condition data type must be passed at compile time using -DCOND_DATA_TYPE e.g. -DCOND_DATA_TYPE=short
  *
- * @param[in] src_ptr                              Pointer to the source tensor. Supported data types: S32/F16/F32 and QASYMM8 for operation MEAN
+ * @param[in] src_ptr                              Pointer to the source tensor. Supported data types: S32/F16/F32 and QASYMM8/QASYMM8_SIGNED for operation MEAN
  * @param[in] src_stride_x                         Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_offset_first_element_in_bytes    The offset of the first element in the source tensor
- * @param[in] output_ptr                           The local buffer to hold sumed values. Supported data types: same as @p src_ptt
+ * @param[in] output_ptr                           The local buffer to hold summed values. Supported data types: same as @p src_ptr
  * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in bytes)
  * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
@@ -233,13 +233,13 @@
  * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
  * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
  *
- * @param[in] src_ptr                              Pointer to the source tensor. Supported data types: QASYMM8/S32/F16/F32
+ * @param[in] src_ptr                              Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
  * @param[in] src_stride_x                         Stride of the source tensor in X dimension (in bytes)
  * @param[in] src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
  * @param[in] src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
  * @param[in] src_offset_first_element_in_bytes    The offset of the first element in the source tensor
- * @param[in] output_ptr                           The local buffer to hold sumed values. Supported data types: same as @p src_ptt
+ * @param[in] output_ptr                           The local buffer to hold summed values. Supported data types: same as @p src_ptr
  * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in bytes)
  * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
@@ -316,7 +316,7 @@
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
  * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
  *
- * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/S32/F16/F32
+ * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
  * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
  * @param[in] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
@@ -324,7 +324,7 @@
  * @param[in] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
  * @param[in] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
- * @param[in] output_ptr                           The local buffer to hold sumed values. Supported data types: same as @p input_ptt
+ * @param[in] output_ptr                           The local buffer to hold summed values. Supported data types: same as @p input_ptr
  * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in bytes)
  * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
@@ -420,7 +420,7 @@
  * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
  * @note The depth size must be passed at compile time using -DBATCH e.g. -DDEPTH=128
  *
- * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/S32/F16/F32
+ * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
  * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
  * @param[in] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
@@ -430,7 +430,7 @@
  * @param[in] input_stride_w                       Stride of the source tensor in W dimension (in bytes)
  * @param[in] input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
- * @param[in] output_ptr                           The local buffer to hold sumed values. Supported data types: same as @p input_ptt
+ * @param[in] output_ptr                           The local buffer to hold summed values. Supported data types: same as @p input_ptr
  * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in bytes)
  * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/remap.cl b/src/core/CL/cl_kernels/remap.cl
index e0f3bf3..0f013c5 100644
--- a/src/core/CL/cl_kernels/remap.cl
+++ b/src/core/CL/cl_kernels/remap.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/reorg_layer.cl b/src/core/CL/cl_kernels/reorg_layer.cl
index a275699..29344de 100644
--- a/src/core/CL/cl_kernels/reorg_layer.cl
+++ b/src/core/CL/cl_kernels/reorg_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,7 +39,7 @@
  * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64
  * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -79,7 +79,7 @@
  * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64
  * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/repeat.h b/src/core/CL/cl_kernels/repeat.h
index 4e761db..59bf5b9 100644
--- a/src/core/CL/cl_kernels/repeat.h
+++ b/src/core/CL/cl_kernels/repeat.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/reshape_layer.cl b/src/core/CL/cl_kernels/reshape_layer.cl
index 4bfdf1e..2d6a7ed 100644
--- a/src/core/CL/cl_kernels/reshape_layer.cl
+++ b/src/core/CL/cl_kernels/reshape_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/reverse.cl b/src/core/CL/cl_kernels/reverse.cl
index 6afd382..10ffe84 100644
--- a/src/core/CL/cl_kernels/reverse.cl
+++ b/src/core/CL/cl_kernels/reverse.cl
@@ -1,26 +1,26 @@
 /*
-* Copyright (c) 2018 ARM Limited.
-*
-* SPDX-License-Identifier: MIT
-*
-* Permission is hereby granted, free of charge, to any person obtaining a copy
-* of this software and associated documentation files (the "Software"), to
-* deal in the Software without restriction, including without limitation the
-* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-* sell copies of the Software, and to permit persons to whom the Software is
-* furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be included in all
-* copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-* SOFTWARE.
-*/
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
 #include "helpers.h"
 
 #if defined(DATA_TYPE) && defined(NUM_REVERSE_DIMS)
@@ -34,7 +34,7 @@
  * @note The data type must be given as a preprocessor argument using -DDATA_TYPE=num. e.g. -DDATA_TYPE=uint
  * @note The number of dimensions to reverse must be given as a preprocessor argument using -DNUM_REVERSE_DIMS=num, e.g. -DNUM_REVERSE_DIMS=3
  *
- * @param[in]  src_ptr                            Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_ptr                            Pointer to the source tensor. Supported data types: All
  * @param[in]  src_stride_x                       Stride of the first source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/roi_align_layer.cl b/src/core/CL/cl_kernels/roi_align_layer.cl
index 430369b..e0b98e6 100644
--- a/src/core/CL/cl_kernels/roi_align_layer.cl
+++ b/src/core/CL/cl_kernels/roi_align_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/roi_align_layer_quantized.cl b/src/core/CL/cl_kernels/roi_align_layer_quantized.cl
index 8093623..d5c9a0d 100644
--- a/src/core/CL/cl_kernels/roi_align_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/roi_align_layer_quantized.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/roi_pooling_layer.cl b/src/core/CL/cl_kernels/roi_pooling_layer.cl
index 0cf296c..ac193e8 100644
--- a/src/core/CL/cl_kernels/roi_pooling_layer.cl
+++ b/src/core/CL/cl_kernels/roi_pooling_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/scale.cl b/src/core/CL/cl_kernels/scale.cl
index 499f9ea..a01ff89 100644
--- a/src/core/CL/cl_kernels/scale.cl
+++ b/src/core/CL/cl_kernels/scale.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -100,10 +100,14 @@
     const float scale_x,
     const float scale_y)
 {
-    Image        in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
-    Image        out = CONVERT_TO_IMAGE_STRUCT(out);
-    const float2 r   = (float2)(scale_x, scale_y);
-    const float8 tc  = clamp_to_border_with_size(transform_nearest(get_current_coords(), r), input_width, input_height, BORDER_SIZE);
+    Image        in          = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+    Image        out         = CONVERT_TO_IMAGE_STRUCT(out);
+    const float2 r           = (float2)(scale_x, scale_y);
+    float8       transformed = transform_nearest(get_current_coords(), r);
+#ifdef ALIGN_CORNERS
+    transformed = round(transformed);
+#endif // ALIGN_CORNERS
+    const float8 tc = clamp_to_border_with_size(transformed, input_width, input_height, BORDER_SIZE);
     vstore4(read_texels4(&in, convert_int8(tc)), 0, (__global DATA_TYPE *)out.ptr);
 }
 
@@ -182,14 +186,18 @@
     Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
 
 #ifdef SAMPLING_POLICY_TOP_LEFT
-    const float new_x = get_global_id(1) * scale_x;
-    const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
+    float new_x = get_global_id(1) * scale_x;
+    float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
 #elif SAMPLING_POLICY_CENTER
-    const float new_x = (get_global_id(1) + 0.5f) * scale_x;
-    const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y;
+    float new_x = (get_global_id(1) + 0.5f) * scale_x;
+    float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y;
 #else /* SAMPLING_POLICY */
 #error("Unsupported sampling policy");
 #endif /* SAMPLING_POLICY */
+#ifdef ALIGN_CORNERS
+    new_x = round(new_x);
+    new_y = round(new_y);
+#endif /* ALIGN_CORNERS */
     const float clamped_x = clamp(new_x, 0.0f, input_width - 1);
     const float clamped_y = clamp(new_y, 0.0f, input_height - 1);
 
diff --git a/src/core/CL/cl_kernels/scale_quantized.cl b/src/core/CL/cl_kernels/scale_quantized.cl
index ccbd71e..2aa7f18 100644
--- a/src/core/CL/cl_kernels/scale_quantized.cl
+++ b/src/core/CL/cl_kernels/scale_quantized.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/scharr_filter.cl b/src/core/CL/cl_kernels/scharr_filter.cl
index d9b5d07..d2868b6 100644
--- a/src/core/CL/cl_kernels/scharr_filter.cl
+++ b/src/core/CL/cl_kernels/scharr_filter.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/select.cl b/src/core/CL/cl_kernels/select.cl
index d783ae2..52ef815 100644
--- a/src/core/CL/cl_kernels/select.cl
+++ b/src/core/CL/cl_kernels/select.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,7 +38,7 @@
  * @param[in]  c_stride_z                        Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  c_step_z                          c_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  c_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[in]  x_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  x_ptr                             Pointer to the source tensor. Supported data types: All
  * @param[in]  x_stride_x                        Stride of the source tensor in X dimension (in bytes)
  * @param[in]  x_step_x                          x_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  x_stride_y                        Stride of the source tensor in Y dimension (in bytes)
@@ -46,7 +46,7 @@
  * @param[in]  x_stride_z                        Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  x_step_z                          x_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  x_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[in]  y_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  y_ptr                             Pointer to the source tensor. Supported data types: same as @p x_ptr
  * @param[in]  y_stride_x                        Stride of the source tensor in X dimension (in bytes)
  * @param[in]  y_step_x                          y_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  y_stride_y                        Stride of the source tensor in Y dimension (in bytes)
@@ -54,7 +54,7 @@
  * @param[in]  y_stride_z                        Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  y_step_z                          y_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  y_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: same as @p x_ptr
  * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
@@ -98,7 +98,7 @@
  * @param[in]  c_stride_x                        Stride of the source tensor in X dimension (in bytes)
  * @param[in]  c_step_x                          c_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  c_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[in]  x_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  x_ptr                             Pointer to the source tensor. Supported data types: All
  * @param[in]  x_stride_x                        Stride of the source tensor in X dimension (in bytes)
  * @param[in]  x_step_x                          x_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  x_stride_y                        Stride of the source tensor in Y dimension (in bytes)
@@ -106,7 +106,7 @@
  * @param[in]  x_stride_z                        Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  x_step_z                          x_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  x_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[in]  y_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  y_ptr                             Pointer to the source tensor. Supported data types: same as @p x_ptr
  * @param[in]  y_stride_x                        Stride of the source tensor in X dimension (in bytes)
  * @param[in]  y_step_x                          y_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  y_stride_y                        Stride of the source tensor in Y dimension (in bytes)
@@ -114,7 +114,7 @@
  * @param[in]  y_stride_z                        Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  y_step_z                          y_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  y_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: same as @p x_ptr
  * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
@@ -162,7 +162,7 @@
  * @param[in]  c_stride_x                        Stride of the source tensor in X dimension (in bytes)
  * @param[in]  c_step_x                          c_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  c_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[in]  x_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  x_ptr                             Pointer to the source tensor. Supported data types: All
  * @param[in]  x_stride_x                        Stride of the source tensor in X dimension (in bytes)
  * @param[in]  x_step_x                          x_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  x_stride_y                        Stride of the source tensor in Y dimension (in bytes)
@@ -170,7 +170,7 @@
  * @param[in]  x_stride_z                        Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  x_step_z                          x_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  x_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[in]  y_ptr                             Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  y_ptr                             Pointer to the source tensor. Supported data types: same as @p x_ptr
  * @param[in]  y_stride_x                        Stride of the source tensor in X dimension (in bytes)
  * @param[in]  y_step_x                          y_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  y_stride_y                        Stride of the source tensor in Y dimension (in bytes)
@@ -178,7 +178,7 @@
  * @param[in]  y_stride_z                        Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  y_step_z                          y_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  y_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: same as @p x_ptr
  * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/slice_ops.cl b/src/core/CL/cl_kernels/slice_ops.cl
index 5dc0f2d..dc3ffd9 100644
--- a/src/core/CL/cl_kernels/slice_ops.cl
+++ b/src/core/CL/cl_kernels/slice_ops.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/sobel_filter.cl b/src/core/CL/cl_kernels/sobel_filter.cl
index fc2b0ee..7983734 100644
--- a/src/core/CL/cl_kernels/sobel_filter.cl
+++ b/src/core/CL/cl_kernels/sobel_filter.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/softmax_layer.cl b/src/core/CL/cl_kernels/softmax_layer.cl
index 767cf4c..77dbb47 100644
--- a/src/core/CL/cl_kernels/softmax_layer.cl
+++ b/src/core/CL/cl_kernels/softmax_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -112,6 +112,7 @@
     VEC_DATA_TYPE(DATA_TYPE, 16)
     data = vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0));
 #ifdef LOG_SOFTMAX
+    sum_val = log(sum_val);
     vstore16(SUB_OP(data, sum_val, DATA_TYPE, 16), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
 #else  /* LOG_SOFTMAX */
     vstore16(DIV_OP(data, sum_val, DATA_TYPE, 16), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
diff --git a/src/core/CL/cl_kernels/softmax_layer_quantized.cl b/src/core/CL/cl_kernels/softmax_layer_quantized.cl
index 5d35e50..22b8df8 100644
--- a/src/core/CL/cl_kernels/softmax_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/softmax_layer_quantized.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -96,7 +96,7 @@
  * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
  * @note -DDIFF_MIN must be passed at compile time. It is threshold difference between maximum value of input data and current processed value, it defines whether the value will be taken into account or not.
  *
- * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: QASYMM8
+ * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
@@ -194,7 +194,7 @@
         data_fp                = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
         VSTORE(VECTOR_SIZE)
         (data_diff, 0, (__global int *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
-        sum1D = sum1D + select(MIN_VALUE, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
+        sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
     }
 
 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE
@@ -208,8 +208,8 @@
     VEC_INT widx_          = CONVERT(((VEC_UINT)(width4 << LOG_VECTOR_SIZE) + idx__) < width, VEC_INT);
     VSTORE(VECTOR_SIZE)
     (data_diff, 0, (__global int *)offset(&dst, width4 << LOG_VECTOR_SIZE, 0));
-    data_fp = select(MIN_VALUE, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
-    sum1D   = sum1D + select(MIN_VALUE, data_fp, widx_);
+    data_fp = select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
+    sum1D   = sum1D + select(0, data_fp, widx_);
 #endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
 
     // Perform sum reduction
@@ -417,7 +417,7 @@
         data_fp             = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
         data_fp             = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
         vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4, 0));
-        sum1D = sum1D + select(MIN_VALUE, data_fp, data_diff >= (int4)(DIFF_MIN));
+        sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN));
     }
 #ifdef NON_MULTIPLE_OF_GRID_SIZE
     //TODO: Optimize the calculation (avoid %).
@@ -432,7 +432,7 @@
         data_fp             = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
         data_fp             = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
         vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4, 0));
-        sum1D = sum1D + select(MIN_VALUE, data_fp, data_diff >= (int4)(DIFF_MIN));
+        sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN));
     }
 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE
     if(boundary_workitems == 0)
@@ -451,9 +451,10 @@
         data_fp             = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
         data_fp             = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
         int4 widx           = convert_int4(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width);
-        data_fp             = select(MIN_VALUE, data_fp, widx);
         vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4 + 4, 0));
-        sum1D = sum1D + select(MIN_VALUE, data_fp, data_diff >= (int4)(DIFF_MIN));
+        data_fp = select(MIN_VALUE, data_fp, data_diff >= (int4)(DIFF_MIN));
+        data_fp = select(0, data_fp, widx);
+        sum1D   = sum1D + data_fp;
     }
 #endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
 #endif /* NON_MULTIPLE_OF_GRID_SIZE */
@@ -548,7 +549,7 @@
  * @param[in]  sum_stride_z                      Stride of the sum values tensor in Z dimension (in bytes)
  * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: QASYMM8
+ * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
  * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
@@ -570,14 +571,12 @@
     int sum_val = *((__global int *)offset(&sum, 0, get_global_id(1)));
 
     // It will be better to calculate this in prev layer and pass here as parameter
-#ifndef LOG_SOFTMAX
     uint  sum_val_u               = convert_uint(sum_val);
     int   headroom_plus_one       = clz(sum_val_u);
     int   num_bits_over_unit      = EXP_ACCUMULATION_INT_BITS - headroom_plus_one;
     int   shifted_sum_minus_one_1 = convert_int((sum_val_u << headroom_plus_one) - (1u << 31));
     int16 shifted_sum_minus_one   = shifted_sum_minus_one_1;
     int16 shifted_scale           = ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(shifted_sum_minus_one, 16);
-#endif /* LOG_SOFTMAX */
 
     // It was already calculated in prev layer, should be stored into tmp output and reused
     int16 data_diff      = vload16(0, (__global int *)offset(&src, 0, 0));
@@ -589,18 +588,13 @@
     }
 #endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
 
-#ifdef LOG_SOFTMAX
-    long16 data = SUB_OP(convert_long16(data_diff_mult), (long16)(sum_val), long, 16);
-    data        = select(0L, data, convert_long16(data_diff) >= (long16)(DIFF_MIN));
-#else /* LOG_SOFTMAX */
     int16 data = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 16);
     data       = ASYMM_MULT(shifted_scale, data, 16);
     data       = ASYMM_ROUNDING_DIVIDE_BY_POW2(data, num_bits_over_unit + 31 - 8, 16);
 #ifdef QASYMM8_SIGNED
-    data       = ADD_OP(data, (int16)(MIN_VALUE), int, 16);
+    data = ADD_OP(data, (int16)(MIN_VALUE), int, 16);
 #endif /* QASYMM8_SIGNED */
-    data       = select(MIN_VALUE, data, data_diff >= (int16)(DIFF_MIN));
-#endif /* LOG_SOFTMAX */
+    data = select(MIN_VALUE, data, data_diff >= (int16)(DIFF_MIN));
     vstore16(CONVERT_SAT(data, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
 }
 
diff --git a/src/core/CL/cl_kernels/space_to_batch.cl b/src/core/CL/cl_kernels/space_to_batch.cl
index 3098b25..5ade9c5 100644
--- a/src/core/CL/cl_kernels/space_to_batch.cl
+++ b/src/core/CL/cl_kernels/space_to_batch.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/space_to_depth.cl b/src/core/CL/cl_kernels/space_to_depth.cl
index 8dbada2..1217a37 100644
--- a/src/core/CL/cl_kernels/space_to_depth.cl
+++ b/src/core/CL/cl_kernels/space_to_depth.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/stack_layer.cl b/src/core/CL/cl_kernels/stack_layer.cl
index bed6266..438e858 100644
--- a/src/core/CL/cl_kernels/stack_layer.cl
+++ b/src/core/CL/cl_kernels/stack_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -66,7 +66,7 @@
  * @note Dimension 2 of the input tensor must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM2=112)
  * @note Dimension 3 of the output tensor must be passed at compile time using -DDST_DIM3 (e.g. -DDST_DIM3=112)
  *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/tablelookup.cl b/src/core/CL/cl_kernels/tablelookup.cl
index cee116b..0ef1648 100644
--- a/src/core/CL/cl_kernels/tablelookup.cl
+++ b/src/core/CL/cl_kernels/tablelookup.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/threshold.cl b/src/core/CL/cl_kernels/threshold.cl
index 2b1e6ff..ff3ac05 100644
--- a/src/core/CL/cl_kernels/threshold.cl
+++ b/src/core/CL/cl_kernels/threshold.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/tile.cl b/src/core/CL/cl_kernels/tile.cl
index 1c8de67..79da7fe 100644
--- a/src/core/CL/cl_kernels/tile.cl
+++ b/src/core/CL/cl_kernels/tile.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/transpose.cl b/src/core/CL/cl_kernels/transpose.cl
index c993005..785be6c 100644
--- a/src/core/CL/cl_kernels/transpose.cl
+++ b/src/core/CL/cl_kernels/transpose.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -132,7 +132,7 @@
  *  -# -DDATA_TYPE_IN_BYTES=2 for transposing U16, S16 or FP16 matrices
  *  -# -DDATA_TYPE_IN_BYTES=4 for transposing U32, S32 or FP32 matrices
  *
- * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/types.h b/src/core/CL/cl_kernels/types.h
index 8773646..7d56acd 100644
--- a/src/core/CL/cl_kernels/types.h
+++ b/src/core/CL/cl_kernels/types.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/unpooling_layer.cl b/src/core/CL/cl_kernels/unpooling_layer.cl
new file mode 100644
index 0000000..457e9bf
--- /dev/null
+++ b/src/core/CL/cl_kernels/unpooling_layer.cl
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+/** Performs max unpooling function with pool size equal to 2.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32
+ * @note The width of the output tensor must be passed using -DWIDTH_DST e.g. -DWIDTH_DST=24
+ * @note The height of the output tensor must be passed using -DHEIGHT_DST e.g. -DHEIGHT_DST=54
+ * @note The depth of the output tensor must be passed using -DDEPTH_DST e.g. -DDEPTH_DST=32
+ *
+ * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[out] output_ptr                            Pointer to the output tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                       Stride of the output tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                       Stride of the output tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                       Stride of the output tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the output tensor
+ * @param[in]  indices_ptr                           Pointer to the indices tensor. Supported data types: U32
+ * @param[in]  indices_stride_x                      Stride of the indices tensor in X dimension (in bytes)
+ * @param[in]  indices_step_x                        indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  indices_stride_y                      Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in]  indices_step_y                        indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  indices_stride_z                      Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in]  indices_step_z                        indices_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
+ */
+__kernel void max_unpooling_layer_2(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output),
+    TENSOR3D_DECLARATION(indices))
+{
+    Tensor3D input   = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output  = CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(output);
+    Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices);
+
+    unsigned int index = *((__global unsigned int *)indices.ptr);
+    DATA_TYPE value    = *((__global DATA_TYPE *)input.ptr);
+
+    *((__global DATA_TYPE *)tensor3D_index2ptr(&output, WIDTH_DST, HEIGHT_DST, DEPTH_DST, index)) = value;
+}
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/upsample_layer.cl b/src/core/CL/cl_kernels/upsample_layer.cl
index 65912f5..d0cc0f2 100644
--- a/src/core/CL/cl_kernels/upsample_layer.cl
+++ b/src/core/CL/cl_kernels/upsample_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,13 +26,13 @@
 /** This function applies upsample on an input image. (NCHW)
  *
  * @attention The following variables must be passed at compile time:
- * -# -DDATA_TYPE = Tensor data type. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * -# -DDATA_TYPE = Tensor data type. Supported data types: All
  * -# -DVEC_SIZE_IN = Input vector size
  * -# -DVEC_SIZE_OUT = Output vector size
  * -# -DLAST_ACCESSED_X_IN = The input element that is on the X border (threads trying to set this, might need to step back a bit)
  * -# -DLAST_ACCESSED_X_OUT = The output element that is on the X border (threads trying to set this, might need to step back a bit)
  *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
@@ -81,13 +81,13 @@
 /** This function applies upsample on an input image. (NHWC)
  *
  * @attention The following variables must be passed at compile time:
- * -# -DDATA_TYPE = Tensor data type. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * -# -DDATA_TYPE = Tensor data type. Supported data types: All
  * -# -DVEC_SIZE_IN = Input vector size
  * -# -DVEC_SIZE_OUT = Output vector size
  * -# -DLAST_ACCESSED_X_IN = The input element that is on the X border (threads trying to set this, might need to step back a bit)
  * -# -DLAST_ACCESSED_X_OUT = The output element that is on the X border (threads trying to set this, might need to step back a bit)
  *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: All
  * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
  * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/warp_affine.cl b/src/core/CL/cl_kernels/warp_affine.cl
index f41821c..909b920 100644
--- a/src/core/CL/cl_kernels/warp_affine.cl
+++ b/src/core/CL/cl_kernels/warp_affine.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/warp_helpers.h b/src/core/CL/cl_kernels/warp_helpers.h
index 9afec7d..6482825 100644
--- a/src/core/CL/cl_kernels/warp_helpers.h
+++ b/src/core/CL/cl_kernels/warp_helpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/warp_helpers_quantized.h b/src/core/CL/cl_kernels/warp_helpers_quantized.h
index fc9788f..ca21be6 100644
--- a/src/core/CL/cl_kernels/warp_helpers_quantized.h
+++ b/src/core/CL/cl_kernels/warp_helpers_quantized.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/warp_perspective.cl b/src/core/CL/cl_kernels/warp_perspective.cl
index 6ffb7e4..bed7838 100644
--- a/src/core/CL/cl_kernels/warp_perspective.cl
+++ b/src/core/CL/cl_kernels/warp_perspective.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/winograd_filter_transform.cl b/src/core/CL/cl_kernels/winograd_filter_transform.cl
index 3f203b8..5c3bb8a 100644
--- a/src/core/CL/cl_kernels/winograd_filter_transform.cl
+++ b/src/core/CL/cl_kernels/winograd_filter_transform.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/winograd_input_transform.cl b/src/core/CL/cl_kernels/winograd_input_transform.cl
index 630a78b..48a4e0d 100644
--- a/src/core/CL/cl_kernels/winograd_input_transform.cl
+++ b/src/core/CL/cl_kernels/winograd_input_transform.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/winograd_output_transform.cl b/src/core/CL/cl_kernels/winograd_output_transform.cl
index 8140cad..efd8502 100644
--- a/src/core/CL/cl_kernels/winograd_output_transform.cl
+++ b/src/core/CL/cl_kernels/winograd_output_transform.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -157,8 +157,8 @@
 
     // Store the output tile
 #if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    const const VEC_DATA_TYPE(DATA_TYPE, 2)
-    out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);
+    const VEC_DATA_TYPE(DATA_TYPE, 2)
+    out0_dt                                            = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);
     *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
     *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
 #else  // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
@@ -288,7 +288,7 @@
     // Get output address
     int offset = dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
     VEC_DATA_TYPE(DATA_TYPE, 2)
-    out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);
+    out0_dt                                                      = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);
     *(__global DATA_TYPE *)(dst_ptr + 0 * dst_stride_y + offset) = out0_dt.s0;
     *(__global DATA_TYPE *)(dst_ptr + 1 * dst_stride_y + offset) = out0_dt.s1;
 #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
@@ -599,7 +599,8 @@
     // Store the output tile
 #if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
     VEC_DATA_TYPE(DATA_TYPE, 4)
-    out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
+    out0_dt                                                = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,
+                                                                        B_VAL);
     *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
     *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
     *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
@@ -839,7 +840,7 @@
 
     // Store the 1x4 output tile
     VEC_DATA_TYPE(DATA_TYPE, 4)
-    out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
+    out0_dt                                        = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
     *((__global DATA_TYPE *)(dst_ptr + offset.s0)) = out0_dt.s0;
     *((__global DATA_TYPE *)(dst_ptr + offset.s1)) = out0_dt.s1;
     *((__global DATA_TYPE *)(dst_ptr + offset.s2)) = out0_dt.s2;
@@ -875,7 +876,7 @@
     out2_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
     VEC_DATA_TYPE(DATA_TYPE, 4)
     out3_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33),
-                                                              VEC_DATA_TYPE(DATA_TYPE, 4)),
+                                                             VEC_DATA_TYPE(DATA_TYPE, 4)),
                          A_VAL, B_VAL);
     *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0)) = out0_dt.s0;
     *((__global DATA_TYPE *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0)) = out0_dt.s1;
@@ -1011,7 +1012,8 @@
     // Store the output tile
 #if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
     VEC_DATA_TYPE(DATA_TYPE, 4)
-    out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
+    out0_dt                                                = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL,
+                                                                        B_VAL);
     *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
     *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
     *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2;
@@ -1239,7 +1241,7 @@
     offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, (int4)dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).
 
     VEC_DATA_TYPE(DATA_TYPE, 4)
-    out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
+    out0_dt                                      = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, CONVERT((VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), VEC_DATA_TYPE(DATA_TYPE, 4)), A_VAL, B_VAL);
     *(__global DATA_TYPE *)(dst_ptr + offset.s0) = out0_dt.s0;
     *(__global DATA_TYPE *)(dst_ptr + offset.s1) = out0_dt.s1;
     *(__global DATA_TYPE *)(dst_ptr + offset.s2) = out0_dt.s2;
diff --git a/src/core/CL/cl_kernels/yolo_layer.cl b/src/core/CL/cl_kernels/yolo_layer.cl
index e59396d..2a15a32 100644
--- a/src/core/CL/cl_kernels/yolo_layer.cl
+++ b/src/core/CL/cl_kernels/yolo_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/gemm/CLGEMMHelpers.cpp b/src/core/CL/gemm/CLGEMMHelpers.cpp
index 4597d79..5734c93 100644
--- a/src/core/CL/gemm/CLGEMMHelpers.cpp
+++ b/src/core/CL/gemm/CLGEMMHelpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,6 +23,11 @@
  */
 #include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
 
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/ITensorInfo.h"
+
 #include <utility>
 
 namespace arm_compute
@@ -30,26 +35,50 @@
 namespace cl_gemm
 {
 std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
-                                                                       bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose)
+                                                                       bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image)
 {
-    GEMMLHSMatrixInfo lhs_info;
-    GEMMRHSMatrixInfo rhs_info;
+    v0 = ((m / (m0 * v0)) == 0) ? 1 : v0;
+    h0 = ((n / (n0 * h0)) == 0) ? 1 : h0;
 
-    // Configure GEMMLHSMatrixInfo
-    lhs_info.m0         = m0;
-    lhs_info.k0         = k0;
-    lhs_info.v0         = ((m / (lhs_info.m0 * v0)) == 0) ? 1 : v0;
-    lhs_info.interleave = lhs_interleave;
-    lhs_info.transpose  = lhs_transpose;
-
-    // Configure GEMMRHSMatrixInfo
-    rhs_info.n0         = n0;
-    rhs_info.k0         = lhs_info.k0;
-    rhs_info.h0         = ((n / (rhs_info.n0 * h0)) == 0) ? 1 : h0;
-    rhs_info.interleave = rhs_interleave;
-    rhs_info.transpose  = rhs_transpose;
+    const GEMMLHSMatrixInfo lhs_info(m0, k0, v0, lhs_transpose, lhs_interleave);
+    const GEMMRHSMatrixInfo rhs_info(n0, k0, h0, rhs_transpose, rhs_interleave, export_to_cl_image);
 
     return std::make_pair(lhs_info, rhs_info);
 }
+
+void update_padding_for_cl_image(ITensorInfo *tensor)
+{
+    constexpr unsigned int num_floats_per_pixel = 4;
+
+    const unsigned int stride_y_in_elements = tensor->strides_in_bytes()[1] / tensor->element_size();
+    const unsigned int pixel_aligment       = get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device());
+    const unsigned int row_pitch_alignment  = pixel_aligment * num_floats_per_pixel;
+    const unsigned int round_up_width       = ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment;
+    const unsigned int padding              = round_up_width - stride_y_in_elements;
+
+    tensor->extend_padding(PaddingSize(0, padding, 0, 0));
+}
+
+Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info)
+{
+    if(rhs_info.export_to_cl_image)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.n0 == 2) || (rhs_info.n0 == 3), "Export to cl_image only supported with n0 = 4, 8 or 16");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.k0 == 2) || (rhs_info.k0 == 3), "Export to cl_image only supported with k0 = 4, 8 or 16");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.data_type() != DataType::F32, "Export to cl_image only supported with F32 data type");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), "The extension cl_khr_image2d_from_buffer is not supported on the target platform");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0, "Impossible to retrieve the cl_image pitch alignment");
+
+        // Check the width and height of the output tensor.
+        // Since we cannot create a 3d image from a buffer, the third dimension is collapsed on the second dimension
+        const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
+        const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[0] > max_image_w * 4, "Not supported width for cl_image");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[1] * tensor_reshaped_info.tensor_shape()[2] > max_image_h, "Not supported height for cl_image");
+    }
+
+    return Status{};
+}
 } // namespace cl_gemm
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.cpp b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.cpp
index c6b51c6..51b7fc7 100644
--- a/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.cpp
+++ b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.cpp b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.cpp
index 86c056f..3e7c176 100644
--- a/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.cpp
+++ b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.cpp b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.cpp
index c25cdac..efc82fb 100644
--- a/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.cpp
+++ b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.cpp b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.cpp
index 990cc72..a533f14 100644
--- a/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.cpp
+++ b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,9 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
 #include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include <map>
 #include <utility>
@@ -35,6 +38,8 @@
 {
 namespace cl_gemm
 {
+using namespace arm_compute::misc::shape_calculator;
+
 CLGEMMReshapedKernelConfigurationBifrost::CLGEMMReshapedKernelConfigurationBifrost(GPUTarget gpu)
     : ICLGEMMKernelConfiguration(gpu)
 {
@@ -153,13 +158,48 @@
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
 
+    GEMMLHSMatrixInfo lhs_info_buf;
+    GEMMRHSMatrixInfo rhs_info_buf;
+    GEMMLHSMatrixInfo lhs_info_img;
+    GEMMRHSMatrixInfo rhs_info_img;
+
+    // Get lhs_info/rhs_info in case of OpenCL buffer
     if(n <= 4)
     {
-        return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true);
+        std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true);
     }
     else
     {
-        return configure_lhs_rhs_info(m, n, 4, 4, 2, 8, 16, false, false, false, true);
+        std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 2, 8, 16, false, false, false, true);
+    }
+
+    // Get lhs_info/rhs_info in case of OpenCL image
+    // Condition on the GPU workload
+    if((m / 4) * (n / 4) >= 2560)
+    {
+        // Big workload
+        std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 8, true, true, true, false, true);
+    }
+    else
+    {
+        // Small workload
+        std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 1, true, true, true, false, true);
+    }
+
+    const TensorInfo  tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32);
+    const TensorShape shape = compute_rhs_reshaped_shape(tensor_rhs_info, rhs_info_img);
+    const TensorInfo  tensor_reshaped_info(shape, 1, DataType::F32);
+
+    // In case of vector by matrix with few work-items, we use the OpenCL buffer rather than the OpenCL image2d
+    const bool use_cl_image2d = (n <= 4) ? false : true;
+
+    if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d)
+    {
+        return std::make_pair(lhs_info_img, rhs_info_img);
+    }
+    else
+    {
+        return std::make_pair(lhs_info_buf, rhs_info_buf);
     }
 }
 
diff --git a/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.cpp b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.cpp
index b96dc96..0c09f50 100644
--- a/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.cpp
+++ b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp
index 8826cca..f9b65dc 100644
--- a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp
+++ b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,9 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
 #include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include <map>
 #include <utility>
@@ -35,6 +38,8 @@
 {
 namespace cl_gemm
 {
+using namespace arm_compute::misc::shape_calculator;
+
 CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::CLGEMMReshapedOnlyRHSKernelConfigurationBifrost(GPUTarget gpu)
     : ICLGEMMKernelConfiguration(gpu)
 {
@@ -139,14 +144,64 @@
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
 
+    GEMMLHSMatrixInfo lhs_info_buf;
+    GEMMRHSMatrixInfo rhs_info_buf;
+    GEMMLHSMatrixInfo lhs_info_img;
+    GEMMRHSMatrixInfo rhs_info_img;
+
+    const bool is_workload_big = ((m * n * b) / 16) >= 2048;
+    // Get lhs_info/rhs_info in case of OpenCL buffer
     if(m == 1)
     {
-        const unsigned int h0 = std::max(n / 2, 1U);
-        return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true);
+        if((n / 4) >= 2048)
+        {
+            const unsigned int h0 = std::max(n / 4, 1U);
+            std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, h0, false, true, false, true);
+        }
+        else
+        {
+            const unsigned int h0 = std::max(n / 2, 1U);
+            std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true);
+        }
     }
     else
     {
-        return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, true, false, true);
+        const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1));
+        if(is_workload_big)
+        {
+            std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, true);
+        }
+        else
+        {
+            std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true);
+        }
+    }
+
+    // Get lhs_info/rhs_info in case of OpenCL image
+    const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1));
+    if(is_workload_big)
+    {
+        std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true);
+    }
+    else
+    {
+        std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true, true);
+    }
+
+    const TensorInfo  tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32);
+    const TensorShape shape = compute_rhs_reshaped_shape(tensor_rhs_info, rhs_info_img);
+    const TensorInfo  tensor_reshaped_info(shape, 1, DataType::F32);
+
+    // In case of vector by matrix or small workloads, we use the OpenCL buffer rather than the OpenCL image2d
+    const bool use_cl_image2d = ((m == 1) || ((((m * n * b) / 16) < 2048) && n < 128)) ? false : true;
+
+    if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d)
+    {
+        return std::make_pair(lhs_info_img, rhs_info_img);
+    }
+    else
+    {
+        return std::make_pair(lhs_info_buf, rhs_info_buf);
     }
 }
 
diff --git a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.cpp b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.cpp
index 819a297..9f3461e 100644
--- a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.cpp
+++ b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,9 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
 #include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include <map>
 #include <utility>
@@ -35,6 +38,8 @@
 {
 namespace cl_gemm
 {
+using namespace arm_compute::misc::shape_calculator;
+
 CLGEMMReshapedOnlyRHSKernelConfigurationValhall::CLGEMMReshapedOnlyRHSKernelConfigurationValhall(GPUTarget gpu)
     : ICLGEMMKernelConfiguration(gpu)
 {
@@ -74,32 +79,66 @@
 std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfigurationValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
-    ARM_COMPUTE_UNUSED(b);
 
+    GEMMLHSMatrixInfo lhs_info_buf;
+    GEMMRHSMatrixInfo rhs_info_buf;
+    GEMMLHSMatrixInfo lhs_info_img;
+    GEMMRHSMatrixInfo rhs_info_img;
+
+    // Get lhs_info/rhs_info in case of OpenCL buffer
     if(m == 1)
     {
-        if(n > 2048)
-        {
-            return configure_lhs_rhs_info(m, n, 1, 8, 2, 1, 256, false, true, false, true);
-        }
-        else
-        {
-            return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 256, false, true, false, true);
-        }
+        const unsigned int h0 = std::max(n / 4, 1U);
+        std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, false, true, false, true);
     }
     else
     {
-        if(m > 300)
+        if(m > 256)
         {
-            const int v0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1));
-            return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, v0, false, true, false, true);
+            const int v0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(8)), static_cast<int>(1));
+            std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, v0, false, true, false, true);
         }
         else
         {
-            const int v0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1));
-            return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, v0, false, true, false, true);
+            const int v0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(8)), static_cast<int>(1));
+            std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 4, 4, 1, v0, false, true, false, true);
         }
     }
+
+    // Get lhs_info/rhs_info in case of OpenCL image
+    if(m == 1)
+    {
+        std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 8, true, true, false, false, true);
+    }
+    else
+    {
+        if((m / 4) * (n / 4) > 4096)
+        {
+            const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(8)), static_cast<int>(1));
+            std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true);
+        }
+        else
+        {
+            const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(8)), static_cast<int>(1));
+            std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 4, 1, h0, false, true, false, false, true);
+        }
+    }
+
+    const TensorInfo  tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32);
+    const TensorShape shape = compute_rhs_reshaped_shape(tensor_rhs_info, rhs_info_img);
+    const TensorInfo  tensor_reshaped_info(shape, 1, DataType::F32);
+
+    // In case of small workloads, we use the OpenCL buffer rather than the OpenCL image2d
+    const bool use_cl_image2d = ((m / lhs_info_img.m0) * (n / rhs_info_img.n0)) * b < 1024 ? false : true;
+
+    if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d)
+    {
+        return std::make_pair(lhs_info_img, rhs_info_img);
+    }
+    else
+    {
+        return std::make_pair(lhs_info_buf, rhs_info_buf);
+    }
 }
 
 std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfigurationValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
@@ -120,7 +159,7 @@
             return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true);
         }
     }
-    else if (m < 128)
+    else if(m < 128)
     {
         const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1));
         if(k >= 512)
diff --git a/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp
index 5b03fb5..9deb165 100644
--- a/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp
+++ b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLAccumulateKernel.cpp b/src/core/CL/kernels/CLAccumulateKernel.cpp
index a7dfcdc..f161906 100644
--- a/src/core/CL/kernels/CLAccumulateKernel.cpp
+++ b/src/core/CL/kernels/CLAccumulateKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
index d40e9a1..62cafc5 100644
--- a/src/core/CL/kernels/CLActivationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,16 +27,14 @@
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/helpers/float_ops.h"
+#include "arm_compute/core/utils/misc/Cast.h"
 #include "support/StringSupport.h"
 
-#include <cmath>
 #include <set>
 
 namespace arm_compute
@@ -46,7 +44,7 @@
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::F16, DataType::F32);
 
     static std::set<ActivationLayerInfo::ActivationFunction> quantized_supported_activations =
     {
@@ -116,16 +114,11 @@
 } // namespace
 
 CLActivationLayerKernel::CLActivationLayerKernel()
-    : _input(nullptr), _output(nullptr), _run_in_place(false)
+    : _run_in_place(false)
 {
 }
 
-void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, act_info);
-}
-
-void CLActivationLayerKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
+void CLActivationLayerKernel::configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *output, ActivationLayerInfo act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
 
@@ -134,14 +127,13 @@
     if(output != nullptr)
     {
         // Output auto inizialitation if not yet initialized
-        auto_init_if_empty(*output->info(),
-                           *input->info()->clone());
+        auto_init_if_empty(*output, *input->clone());
     }
 
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, act_info));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, (output != nullptr) ? output : nullptr, act_info));
 
-    const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
-    const DataType     dt                                = input->info()->data_type();
+    const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
+    const DataType     dt                                = input->data_type();
     float              a_const                           = act_info.a();
     float              b_const                           = act_info.b();
 
@@ -163,7 +155,7 @@
     // Set quantization info build options
     if(is_quantized)
     {
-        const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
+        const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
 
         if(!perform_activation_in_float)
         {
@@ -214,7 +206,7 @@
         // Set scale and offset of the input and output if they have different quantization info
         if(output != nullptr)
         {
-            const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
+            const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
 
             if(iq_info != oq_info)
             {
@@ -233,12 +225,8 @@
     // Create kernel
     _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
 
-    // Make sure _kernel is initialized before calling the parent's configure
-    _input  = input;
-    _output = output;
-
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info());
+    auto win_config = validate_and_configure_window(input, (_run_in_place) ? nullptr : output);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure_internal(win_config.second);
 
@@ -246,9 +234,9 @@
     _config_id = "activation_layer_";
     _config_id += lower_string(string_from_data_type(dt));
     _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
+    _config_id += support::cpp11::to_string(input->dimension(0));
     _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
+    _config_id += support::cpp11::to_string(input->dimension(1));
 }
 
 Status CLActivationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
@@ -260,21 +248,25 @@
     return Status{};
 }
 
-void CLActivationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLActivationLayerKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
+    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    ARM_COMPUTE_ERROR_ON(_run_in_place && src != dst);
+
     Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
     Window slice     = collapsed.first_slice_window_3D();
 
     do
     {
         unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice);
+        add_3D_tensor_argument(idx, src, slice);
         if(!_run_in_place)
         {
-            add_3D_tensor_argument(idx, _output, slice);
+            add_3D_tensor_argument(idx, dst, slice);
         }
         enqueue(queue, *this, slice, lws_hint());
     }
diff --git a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
index b86e43e..b78ac27 100644
--- a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,7 +46,7 @@
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Only ARG_IDX_MAX and ARG_IDX_MIN are supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
diff --git a/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp
index 2182019..feebe01 100644
--- a/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchConcatenateLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,20 +27,15 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/Cast.h"
 
 #include "support/StringSupport.h"
 
-#include <map>
-
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int batch_offset, ITensorInfo *output)
@@ -80,34 +75,27 @@
 } // namespace
 
 CLBatchConcatenateLayerKernel::CLBatchConcatenateLayerKernel()
-    : _input(nullptr), _output(nullptr), _batch_offset(0)
+    : _batch_offset(0)
 {
 }
 
-void CLBatchConcatenateLayerKernel::configure(const ICLTensor *input, unsigned int batch_offset, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, batch_offset, output);
-}
-
-void CLBatchConcatenateLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int batch_offset, ICLTensor *output)
+void CLBatchConcatenateLayerKernel::configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int batch_offset, ITensorInfo *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), batch_offset, output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, batch_offset, output));
 
-    _input        = input;
-    _output       = output;
     _batch_offset = batch_offset;
 
-    const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+    const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
 
     // Add build options
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->data_type()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    if(is_data_type_quantized_asymmetric(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info())
+    if(is_data_type_quantized_asymmetric(input->data_type()) && input->quantization_info() != output->quantization_info())
     {
-        const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
-        const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
+        const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
 
         build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset));
         build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
@@ -119,13 +107,13 @@
     _kernel = create_kernel(compile_context, "concatenate", build_opts.options());
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), batch_offset, output->info());
+    auto win_config = validate_and_configure_window(input, batch_offset, output);
     ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
 
     ICLKernel::configure_internal(std::get<1>(win_config));
 
     // Set output valid region
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
 
     // Set config_id for enabling LWS tuning
     _config_id = "concatenate_";
@@ -133,13 +121,13 @@
     _config_id += "_";
     _config_id += support::cpp11::to_string(batch_offset);
     _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(0));
+    _config_id += support::cpp11::to_string(input->dimension(0));
     _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(1));
+    _config_id += support::cpp11::to_string(input->dimension(1));
     _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(2));
+    _config_id += support::cpp11::to_string(input->dimension(2));
     _config_id += "_";
-    _config_id += support::cpp11::to_string(input->info()->dimension(3));
+    _config_id += support::cpp11::to_string(input->dimension(3));
 }
 
 Status CLBatchConcatenateLayerKernel::validate(const arm_compute::ITensorInfo *input,
@@ -151,14 +139,17 @@
     return Status{};
 }
 
-void CLBatchConcatenateLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLBatchConcatenateLayerKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
+    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
     Window slice = window.first_slice_window_3D();
 
-    const int offset_to_first_elements_in_bytes = _batch_offset * _output->info()->strides_in_bytes()[3];
+    const int offset_to_first_elements_in_bytes = _batch_offset * dst->info()->strides_in_bytes()[3];
 
     unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
     _kernel.setArg<cl_int>(idx, offset_to_first_elements_in_bytes);
@@ -166,9 +157,10 @@
     do
     {
         unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice);
-        add_3D_tensor_argument(idx, _output, slice);
+        add_3D_tensor_argument(idx, src, slice);
+        add_3D_tensor_argument(idx, dst, slice);
         enqueue(queue, *this, slice, lws_hint());
     }
     while(window.slide_window_slice_3D(slice));
 }
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
index 9db175d..a2cabcf 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
index e899be9..c74f7e0 100644
--- a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLBitwiseAndKernel.cpp b/src/core/CL/kernels/CLBitwiseAndKernel.cpp
index 45622aa..44378c8 100644
--- a/src/core/CL/kernels/CLBitwiseAndKernel.cpp
+++ b/src/core/CL/kernels/CLBitwiseAndKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLBitwiseNotKernel.cpp b/src/core/CL/kernels/CLBitwiseNotKernel.cpp
index 0ad20a1..08e4c54 100644
--- a/src/core/CL/kernels/CLBitwiseNotKernel.cpp
+++ b/src/core/CL/kernels/CLBitwiseNotKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLBitwiseOrKernel.cpp b/src/core/CL/kernels/CLBitwiseOrKernel.cpp
index a911dd9..77c48e6 100644
--- a/src/core/CL/kernels/CLBitwiseOrKernel.cpp
+++ b/src/core/CL/kernels/CLBitwiseOrKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLBitwiseXorKernel.cpp b/src/core/CL/kernels/CLBitwiseXorKernel.cpp
index 084991a..a15305e 100644
--- a/src/core/CL/kernels/CLBitwiseXorKernel.cpp
+++ b/src/core/CL/kernels/CLBitwiseXorKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
index 55c6f84..95ea3d7 100644
--- a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
+++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLBox3x3Kernel.cpp b/src/core/CL/kernels/CLBox3x3Kernel.cpp
index d665845..7916dce 100644
--- a/src/core/CL/kernels/CLBox3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLBox3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLCannyEdgeKernel.cpp b/src/core/CL/kernels/CLCannyEdgeKernel.cpp
index 95b0397..b8a5365 100644
--- a/src/core/CL/kernels/CLCannyEdgeKernel.cpp
+++ b/src/core/CL/kernels/CLCannyEdgeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLChannelCombineKernel.cpp b/src/core/CL/kernels/CLChannelCombineKernel.cpp
index 017d98f..b0e5111 100644
--- a/src/core/CL/kernels/CLChannelCombineKernel.cpp
+++ b/src/core/CL/kernels/CLChannelCombineKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLChannelExtractKernel.cpp b/src/core/CL/kernels/CLChannelExtractKernel.cpp
index 669d6c5..13ae8f5 100644
--- a/src/core/CL/kernels/CLChannelExtractKernel.cpp
+++ b/src/core/CL/kernels/CLChannelExtractKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
index d4eafec..ad000ba 100644
--- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
+++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp
index 5adb9ef..4050b24 100644
--- a/src/core/CL/kernels/CLCol2ImKernel.cpp
+++ b/src/core/CL/kernels/CLCol2ImKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLColorConvertKernel.cpp b/src/core/CL/kernels/CLColorConvertKernel.cpp
index e9612f3..e14b871 100644
--- a/src/core/CL/kernels/CLColorConvertKernel.cpp
+++ b/src/core/CL/kernels/CLColorConvertKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLComparisonKernel.cpp b/src/core/CL/kernels/CLComparisonKernel.cpp
index 2161907..5bb1d56 100644
--- a/src/core/CL/kernels/CLComparisonKernel.cpp
+++ b/src/core/CL/kernels/CLComparisonKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp b/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp
index 9670fae..7c61146 100644
--- a/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp
+++ b/src/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLConvolutionKernel.cpp b/src/core/CL/kernels/CLConvolutionKernel.cpp
index 2476180..ca07e68 100644
--- a/src/core/CL/kernels/CLConvolutionKernel.cpp
+++ b/src/core/CL/kernels/CLConvolutionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLCopyKernel.cpp b/src/core/CL/kernels/CLCopyKernel.cpp
index a864502..37c3241 100644
--- a/src/core/CL/kernels/CLCopyKernel.cpp
+++ b/src/core/CL/kernels/CLCopyKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLCropKernel.cpp b/src/core/CL/kernels/CLCropKernel.cpp
index eb1ab7a..f828162 100644
--- a/src/core/CL/kernels/CLCropKernel.cpp
+++ b/src/core/CL/kernels/CLCropKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -99,7 +99,7 @@
 {
     ARM_COMPUTE_UNUSED(extrapolation_value, output_window);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::F16, DataType::U32, DataType::S32, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
     ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().num_dimensions() > 4);
     ARM_COMPUTE_RETURN_ERROR_ON(start.x < 0 || start.y < 0 || end.x < 0 || end.y < 0);
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
index a368fae..e8f12d5 100644
--- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
index 7a4b7df..6973034 100644
--- a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
index 65b6036..5978a02 100644
--- a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,20 +27,15 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/Cast.h"
 
 #include "support/StringSupport.h"
 
-#include <map>
-
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int depth_offset, ITensorInfo *output)
@@ -78,34 +73,27 @@
 } // namespace
 
 CLDepthConcatenateLayerKernel::CLDepthConcatenateLayerKernel()
-    : _input(nullptr), _output(nullptr), _depth_offset(0)
+    : _depth_offset(0)
 {
 }
 
-void CLDepthConcatenateLayerKernel::configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, depth_offset, output);
-}
-
-void CLDepthConcatenateLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int depth_offset, ICLTensor *output)
+void CLDepthConcatenateLayerKernel::configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int depth_offset, ITensorInfo *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), depth_offset, output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, depth_offset, output));
 
-    _input        = input;
-    _output       = output;
     _depth_offset = depth_offset;
 
-    const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+    const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
 
     // Add build options
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->data_type()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    if(is_data_type_quantized_asymmetric(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info())
+    if(is_data_type_quantized_asymmetric(input->data_type()) && input->quantization_info() != output->quantization_info())
     {
-        const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
-        const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
+        const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
 
         build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset));
         build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
@@ -117,13 +105,13 @@
     _kernel = create_kernel(compile_context, "concatenate", build_opts.options());
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), depth_offset, output->info());
+    auto win_config = validate_and_configure_window(input, depth_offset, output);
     ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
 
     ICLKernel::configure_internal(std::get<1>(win_config));
 
     // Set output valid region
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
 }
 
 Status CLDepthConcatenateLayerKernel::validate(const arm_compute::ITensorInfo *input,
@@ -135,14 +123,17 @@
     return Status{};
 }
 
-void CLDepthConcatenateLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLDepthConcatenateLayerKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
+    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
     Window slice = window.first_slice_window_3D();
 
-    const int offset_to_first_elements_in_bytes = _depth_offset * _output->info()->strides_in_bytes()[2];
+    const int offset_to_first_elements_in_bytes = _depth_offset * dst->info()->strides_in_bytes()[2];
 
     unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
     _kernel.setArg<cl_int>(idx, offset_to_first_elements_in_bytes);
@@ -150,9 +141,10 @@
     do
     {
         unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice);
-        add_3D_tensor_argument(idx, _output, slice);
+        add_3D_tensor_argument(idx, src, slice);
+        add_3D_tensor_argument(idx, dst, slice);
         enqueue(queue, *this, slice, lws_hint());
     }
     while(window.slide_window_slice_3D(slice));
 }
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp b/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
index 868d4ef..11297e7 100644
--- a/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDepthConvertLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
index ffd3155..b16c961 100644
--- a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
index 936cdd8..066e9a5 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
index fe72260..0930fee 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
index e6c9861..a538ab5 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp
index d284203..07f25a8 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
index ec9b5cb..72eac85 100644
--- a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLDerivativeKernel.cpp b/src/core/CL/kernels/CLDerivativeKernel.cpp
index 595ff9b..ab5f9da 100644
--- a/src/core/CL/kernels/CLDerivativeKernel.cpp
+++ b/src/core/CL/kernels/CLDerivativeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLDilateKernel.cpp b/src/core/CL/kernels/CLDilateKernel.cpp
index 7cba97f..ae94831 100644
--- a/src/core/CL/kernels/CLDilateKernel.cpp
+++ b/src/core/CL/kernels/CLDilateKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index 2fcc82f..d5d808a 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -60,20 +60,9 @@
                                     "Weights feature map dimension should match the respective input's one");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported for 1x1 convolution.");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5) && std::get<0>(conv_info.stride()) > 2,
-                                    "Strides larger than 2 not supported for 3x3 convolution.");
-
-    const auto data_type = input->data_type();
-
-    if(weights->dimension(width_idx) == 9)
-    {
-        const auto supported_data_layout = is_data_type_quantized(data_type) ? DataLayout::NCHW : DataLayout::NHWC;
-        const auto error_message         = std::string("Only " + string_from_data_layout(supported_data_layout) + " layout is supported for 9x9 convolution with " + string_from_data_type(
-                                                           data_type)
-                                                       + " type");
-
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG((supported_data_layout != data_layout), error_message.c_str());
-    }
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5 || weights->dimension(width_idx) == 9)
+                                    && std::get<0>(conv_info.stride()) > 2,
+                                    "Strides larger than 2 not supported for 3x3, 5x5, 9x9 convolution.");
 
     if(biases != nullptr)
     {
@@ -99,6 +88,7 @@
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
 
+    const auto data_type = input->data_type();
     if(is_data_type_quantized(data_type))
     {
         const UniformQuantizationInfo iqinfo = input->quantization_info().uniform();
@@ -358,7 +348,6 @@
     TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info);
 
     // Output auto inizialitation if not yet initialized
-    // TODO(COMPMID-2078): input->clone()->set_tensor_shape(output_shape) doesn't work with subtensors for grouped direct convolutions (AlexNet).
     auto_init_if_empty(*output, output_shape,
                        1,
                        input->data_type(),
@@ -443,7 +432,6 @@
     TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input->info(), *weights->info(), conv_info);
 
     // Output auto inizialitation if not yet initialized
-    // TODO(COMPMID-2078): input->clone()->set_tensor_shape(output_shape) doesn't work with subtensors for grouped direct convolutions (AlexNet).
     auto_init_if_empty(*output->info(),
                        output_shape,
                        1,
diff --git a/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp b/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp
index 5c74579..c8c7fb0 100644
--- a/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp
+++ b/src/core/CL/kernels/CLElementWiseUnaryLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,10 +26,11 @@
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/misc/Cast.h"
 #include "support/StringSupport.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 Status validate_arguments(const ITensorInfo &input, const ITensorInfo &output)
@@ -50,26 +51,22 @@
 }
 } // namespace
 
-void CLElementWiseUnaryLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ElementWiseUnary &op)
+void CLElementWiseUnaryLayerKernel::configure(const ITensorInfo *input, ITensorInfo *output, const ElementWiseUnary &op)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, op);
 }
 
-void CLElementWiseUnaryLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ElementWiseUnary &op)
+void CLElementWiseUnaryLayerKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const ElementWiseUnary &op)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *output->info()));
-
-    // Configure kernel window
-    _input  = input;
-    _output = output;
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input, *output));
 
     const std::string kernel_name    = "elementwise_unary";
-    const int         vec_size_x     = 16 / output->info()->element_size();
-    const int         output_width_x = output->info()->tensor_shape().x();
+    const int         vec_size_x     = 16 / output->element_size();
+    const int         output_width_x = output->tensor_shape().x();
     const bool        multi_access_x = (output_width_x / vec_size_x > 0);
 
-    Window win = calculate_max_window(*output->info());
+    Window win = calculate_max_window(*output);
     if(multi_access_x)
     {
         win.set(Window::DimX,
@@ -79,7 +76,7 @@
 
     // Set kernel build options
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->data_type()));
     build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
     build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
     switch(op)
@@ -122,7 +119,7 @@
     return Status{};
 }
 
-void CLElementWiseUnaryLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLElementWiseUnaryLayerKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
@@ -130,12 +127,16 @@
     Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
     Window slice     = collapsed.first_slice_window_3D();
 
+    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
     do
     {
         unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input, slice);
-        add_3D_tensor_argument(idx, _output, slice);
+        add_3D_tensor_argument(idx, src, slice);
+        add_3D_tensor_argument(idx, dst, slice);
         enqueue(queue, *this, slice, lws_hint());
     }
     while(collapsed.slide_window_slice_3D(slice));
 }
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLElementwiseOperationKernel.cpp b/src/core/CL/kernels/CLElementwiseOperationKernel.cpp
index 00a97d5..ec33500 100644
--- a/src/core/CL/kernels/CLElementwiseOperationKernel.cpp
+++ b/src/core/CL/kernels/CLElementwiseOperationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,7 @@
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/misc/Cast.h"
 #include "support/StringSupport.h"
 #include <map>
 
@@ -93,9 +94,13 @@
 Status validate_arguments_with_arithmetic_rules(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::S16, DataType::QSYMM16, DataType::F16,
+                                                         DataType::S32, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input2);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::S16, DataType::QSYMM16, DataType::F16,
+                                                         DataType::S32, DataType::F32);
 
     const bool is_quantized = is_data_type_quantized(input1.data_type()) || is_data_type_quantized(input2.data_type());
     if(is_quantized)
@@ -119,7 +124,9 @@
     if(output.total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&output);
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, DataType::F16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                             DataType::S16, DataType::QSYMM16, DataType::F16,
+                                                             DataType::S32, DataType::F32);
         ARM_COMPUTE_RETURN_ERROR_ON_MSG((output.data_type() == DataType::U8) && ((input1.data_type() != DataType::U8) || (input2.data_type() != DataType::U8)),
                                         "Output can only be U8 if both inputs are U8");
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
@@ -235,18 +242,15 @@
 {
 }
 
-void CLElementwiseOperationKernel::configure_common(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+void CLElementwiseOperationKernel::configure_common(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
 {
     configure_common(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
 }
 
-void CLElementwiseOperationKernel::configure_common(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+void CLElementwiseOperationKernel::configure_common(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info()));
-
     // Configure kernel window
-    auto win_config = validate_and_configure_window(*input1->info(), *input2->info(), *output->info());
+    auto win_config = validate_and_configure_window(*input1, *input2, *output);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
 
     _input1 = input1;
@@ -254,13 +258,13 @@
     _output = output;
 
     std::string kernel_name = "elementwise_operation_" + name();
-    if(is_data_type_quantized(input1->info()->data_type()))
+    if(is_data_type_quantized(input1->data_type()))
     {
         kernel_name += "_quantized";
     }
 
     // Set kernel build options
-    CLBuildOptions build_opts = generate_build_options(*input1->info(), *input2->info(), *output->info());
+    CLBuildOptions build_opts = generate_build_options(*input1, *input2, *output);
     if(_act_info.enabled())
     {
         build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(_act_info.activation())));
@@ -273,17 +277,21 @@
 
     ICLKernel::configure_internal(win_config.second);
 
-    _config_id = generate_id_for_tuning(kernel_name, *input1->info(), *output->info());
+    _config_id = generate_id_for_tuning(kernel_name, *input1, *output);
 }
 
-void CLElementwiseOperationKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLElementwiseOperationKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const TensorShape &in_shape1 = _input1->info()->tensor_shape();
-    const TensorShape &in_shape2 = _input2->info()->tensor_shape();
-    const TensorShape &out_shape = _output->info()->tensor_shape();
+    const auto src_0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    auto       dst   = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    const TensorShape &in_shape1 = src_0->info()->tensor_shape();
+    const TensorShape &in_shape2 = src_1->info()->tensor_shape();
+    const TensorShape &out_shape = dst->info()->tensor_shape();
 
     bool       can_collapse = true;
     const bool is_vector    = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1;
@@ -310,9 +318,9 @@
     {
         unsigned int idx = 0;
 
-        add_3D_tensor_argument(idx, _input1, slice_input1);
-        add_3D_tensor_argument(idx, _input2, slice_input2);
-        add_3D_tensor_argument(idx, _output, slice);
+        add_3D_tensor_argument(idx, src_0, slice_input1);
+        add_3D_tensor_argument(idx, src_1, slice_input2);
+        add_3D_tensor_argument(idx, dst, slice);
 
         enqueue(queue, *this, slice, lws_hint());
 
@@ -324,23 +332,26 @@
 
 BorderSize CLElementwiseOperationKernel::border_size() const
 {
-    const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+    const unsigned int replicateSize = _output->dimension(0) - std::min(_input1->dimension(0), _input2->dimension(0));
     const unsigned int border        = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
     return BorderSize{ 0, border, 0, 0 };
 }
 
 /** Arithmetic operations with saturation*/
 
-void CLSaturatedArithmeticOperationKernel::configure(ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ConvertPolicy &policy,
+void CLSaturatedArithmeticOperationKernel::configure(ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ConvertPolicy &policy,
                                                      const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), op, input1, input2, output, policy, act_info);
 }
 
-void CLSaturatedArithmeticOperationKernel::configure(const CLCompileContext &compile_context, ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output,
+void CLSaturatedArithmeticOperationKernel::configure(const CLCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output,
                                                      const ConvertPolicy       &policy,
                                                      const ActivationLayerInfo &act_info)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_ERROR_THROW_ON(CLSaturatedArithmeticOperationKernel::validate(op, input1, input2, output, policy, act_info));
+
     _policy   = policy;
     _op       = op;
     _act_info = act_info;
@@ -364,11 +375,6 @@
     return validate_and_configure_window_for_arithmetic_operators(input1, input2, output);
 }
 
-Status CLSaturatedArithmeticOperationKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
-{
-    return validate_arguments_with_arithmetic_rules(input1, input2, output);
-}
-
 CLBuildOptions CLSaturatedArithmeticOperationKernel::generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
 {
     const bool has_float_out = is_data_type_float(output.data_type());
@@ -391,14 +397,17 @@
 
 /** Arithmetic operations*/
 
-void CLArithmeticOperationKernel::configure(ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLArithmeticOperationKernel::configure(ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), op, input1, input2, output, act_info);
 }
 
-void CLArithmeticOperationKernel::configure(const CLCompileContext &compile_context, ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output,
+void CLArithmeticOperationKernel::configure(const CLCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output,
                                             const ActivationLayerInfo &act_info)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_ERROR_THROW_ON(CLArithmeticOperationKernel::validate(op, input1, input2, output, act_info));
+
     _op       = op;
     _act_info = act_info;
     configure_common(compile_context, input1, input2, output);
@@ -434,18 +443,6 @@
         return validate_and_configure_window_for_arithmetic_operators(input1, input2, output);
     }
 }
-Status CLArithmeticOperationKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
-{
-    if(_op == ArithmeticOperation::DIV || _op == ArithmeticOperation::POWER)
-    {
-        // Division and Power operators don't support integer arithmetic
-        return validate_arguments_with_float_only_supported_rules(input1, input2, output);
-    }
-    else
-    {
-        return validate_arguments_with_arithmetic_rules(input1, input2, output);
-    }
-}
 
 CLBuildOptions CLArithmeticOperationKernel::generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
 {
diff --git a/src/core/CL/kernels/CLErodeKernel.cpp b/src/core/CL/kernels/CLErodeKernel.cpp
index 6cb5ffc..a5eb79f 100644
--- a/src/core/CL/kernels/CLErodeKernel.cpp
+++ b/src/core/CL/kernels/CLErodeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
index 03e6ee7..30bca2f 100644
--- a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
+++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
index 63c0939..6c36338 100644
--- a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
+++ b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLFFTScaleKernel.cpp b/src/core/CL/kernels/CLFFTScaleKernel.cpp
index 4738a12..ac5f2b3 100644
--- a/src/core/CL/kernels/CLFFTScaleKernel.cpp
+++ b/src/core/CL/kernels/CLFFTScaleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLFastCornersKernel.cpp b/src/core/CL/kernels/CLFastCornersKernel.cpp
index 4c2086c..e71b472 100644
--- a/src/core/CL/kernels/CLFastCornersKernel.cpp
+++ b/src/core/CL/kernels/CLFastCornersKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index b2f06b3..1ea654b 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,15 +33,11 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/Cast.h"
 #include "support/StringSupport.h"
 
-#include <cstdint>
-#include <set>
-#include <sstream>
-#include <string>
-
-using namespace arm_compute;
-
+namespace arm_compute
+{
 CLFillBorderKernel::CLFillBorderKernel()
     : ICLKernel(), _tensor(nullptr)
 {
@@ -67,10 +63,16 @@
 
 void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
 {
-    ARM_COMPUTE_ERROR_ON(tensor == nullptr);
-    ARM_COMPUTE_ERROR_ON(tensor->info()->num_channels() != 1);
+    _tensor = tensor;
+    configure(compile_context, tensor->info(), border_size, border_mode, constant_border_value);
+}
 
-    border_size.limit(tensor->info()->padding());
+void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+{
+    ARM_COMPUTE_ERROR_ON(tensor == nullptr);
+    ARM_COMPUTE_ERROR_ON(tensor->num_channels() != 1);
+
+    border_size.limit(tensor->padding());
 
     // If there is no border: early exit
     if(border_size.empty() || border_mode == BorderMode::UNDEFINED)
@@ -81,11 +83,11 @@
     // Select appropriate kernel
     std::string kernel_name = "fill_image_borders_" + lower_string(string_from_border_mode(border_mode));
 
-    const DataType dt = tensor->info()->data_type();
+    const DataType dt = tensor->data_type();
 
     // Define build options
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(dt));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt));
     build_opts.add_option("-DBORDER_SIZE_TOP=" + support::cpp11::to_string(border_size.top));
     build_opts.add_option("-DBORDER_SIZE_BOTTOM=" + support::cpp11::to_string(border_size.bottom));
     build_opts.add_option("-DBORDER_SIZE_LEFT=" + support::cpp11::to_string(border_size.left));
@@ -93,16 +95,15 @@
 
     // Create kernel
     _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-    _tensor = tensor;
 
     // Create static kernel arguments
-    const unsigned int valid_width  = tensor->info()->valid_region().shape[0];
-    const unsigned int valid_height = tensor->info()->valid_region().shape[1];
+    const unsigned int valid_width  = tensor->valid_region().shape[0];
+    const unsigned int valid_height = tensor->valid_region().shape[1];
     const cl_int2      valid_region_coords =
     {
         {
-            static_cast<cl_int>(tensor->info()->valid_region().anchor[0]),
-            static_cast<cl_int>(tensor->info()->valid_region().anchor[1]),
+            static_cast<cl_int>(tensor->valid_region().anchor[0]),
+            static_cast<cl_int>(tensor->valid_region().anchor[1]),
         }
     };
     const unsigned int total_valid_width = border_size.left + valid_width + border_size.right;
@@ -154,7 +155,7 @@
     Window win;
     win.set(Window::DimX, Window::Dimension(0, total_valid_width + valid_height));
     win.set(Window::DimY, Window::Dimension(0, 1, 1));
-    win.use_tensor_dimensions(tensor->info()->tensor_shape(), Window::DimZ);
+    win.use_tensor_dimensions(tensor->tensor_shape(), Window::DimZ);
     ICLKernel::configure_internal(win);
 
     // Set config_id for enabling LWS tuning
@@ -162,13 +163,38 @@
     _config_id += "_";
     _config_id += lower_string(string_from_data_type(dt));
     _config_id += "_";
-    _config_id += support::cpp11::to_string(tensor->info()->dimension(0));
+    _config_id += support::cpp11::to_string(tensor->dimension(0));
     _config_id += "_";
-    _config_id += support::cpp11::to_string(tensor->info()->dimension(1));
+    _config_id += support::cpp11::to_string(tensor->dimension(1));
     _config_id += "_";
     _config_id += lower_string(string_from_border_mode(border_mode));
 }
 
+void CLFillBorderKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    // Border mode undefined or border width == 0
+    if(_kernel() == nullptr)
+    {
+        return;
+    }
+
+    const auto tensor = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, tensor, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(collapsed.slide_window_slice_3D(slice));
+}
+
 void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue)
 {
     // Border mode undefined or border width == 0
@@ -191,3 +217,4 @@
     }
     while(collapsed.slide_window_slice_3D(slice));
 }
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLFlattenLayerKernel.cpp b/src/core/CL/kernels/CLFlattenLayerKernel.cpp
index bf2c891..6bd1149 100644
--- a/src/core/CL/kernels/CLFlattenLayerKernel.cpp
+++ b/src/core/CL/kernels/CLFlattenLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLFloorKernel.cpp b/src/core/CL/kernels/CLFloorKernel.cpp
index 9b2133d..09f5f61 100644
--- a/src/core/CL/kernels/CLFloorKernel.cpp
+++ b/src/core/CL/kernels/CLFloorKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
index fd03e83..b582295 100644
--- a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp
index 663cc70..9a2918d 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,7 +61,7 @@
     }
     else
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
     }
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
@@ -70,6 +70,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
 
     const int m = gemm_info.m();
     const int n = gemm_info.n();
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
index eeedfda..56b92a3 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -65,6 +65,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
 
     const int m = gemm_info.m();
     const int n = gemm_info.n();
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp
index 0fdc899..4770329 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -62,7 +62,7 @@
     }
     else
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
     }
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
@@ -74,6 +74,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3) || (rhs_info.k0 > 16)), "Only 2,3,4,8,16 are supported for k0");
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3) || rhs_info.n0 > 16), "Only 2,3,4,8,16 are supported for n0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
 
     const int m = gemm_info.m;
     const int n = gemm_info.n;
@@ -320,7 +321,8 @@
     configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts);
 }
 
-void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info,
+void CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output,
+                                                              const GEMMKernelInfo &gemm_info,
                                                               const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias,
                                                               const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
 {
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
index dc8eb76..6ef9fd2 100644
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
index 26b318b..6d3aa6f 100644
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp
index f9f4839..242d151 100644
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp
index 2db7d6d..55e4ed2 100644
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
index 2306b00..c98f5bf 100644
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
index b4a7cc9..fa78410 100644
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
index 3158d59..9233574 100644
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
index 44f8797..31a97ca 100644
--- a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,7 +49,7 @@
 Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
 
     if(output->total_size() > 0)
     {
diff --git a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
deleted file mode 100644
index 03cd187..0000000
--- a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *accum, const ITensorInfo *biases)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(accum);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
-    ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() != 1);
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *accum, ITensorInfo *biases, GPUTarget gpu_target,
-                                                        unsigned int &num_elems_processed_per_iteration)
-{
-    // Select the vector size to use (8 for Bifrost; 16 for Midgard).
-    bool is_gpu_bifrost = gpu_target_is_in(gpu_target,
-                                           GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
-                                           GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
-                                           GPUTarget::G52, GPUTarget::G52LIT);
-    num_elems_processed_per_iteration = is_gpu_bifrost ? 8 : 16;
-
-    // Configure kernel window
-    Window win = calculate_max_window(*accum, Steps(num_elems_processed_per_iteration));
-
-    AccessWindowStatic     biases_access(biases, 0, 0, ceil_to_multiple(biases->dimension(0), num_elems_processed_per_iteration), biases->dimension(1));
-    AccessWindowHorizontal accum_access(accum, 0, num_elems_processed_per_iteration);
-
-    bool window_changed = update_window_and_padding(win, biases_access, accum_access);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-} // namespace
-
-CLGEMMMatrixAccumulateBiasesKernel::CLGEMMMatrixAccumulateBiasesKernel()
-    : _accum(nullptr), _biases(nullptr)
-{
-}
-
-void CLGEMMMatrixAccumulateBiasesKernel::configure(ICLTensor *accum, const ICLTensor *biases)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), accum, biases);
-}
-
-void CLGEMMMatrixAccumulateBiasesKernel::configure(const CLCompileContext &compile_context, ICLTensor *accum, const ICLTensor *biases)
-{
-    // Perform validate step
-    ARM_COMPUTE_ERROR_ON_NULLPTR(accum, biases);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(accum->info(), biases->info()));
-
-    _biases = biases;
-    _accum  = accum;
-
-    // Get the target gpu
-    GPUTarget    gpu_target  = get_target();
-    unsigned int vector_size = 0;
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(accum->info(), biases->info(), gpu_target, vector_size);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
-
-    // Add build options
-    CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(accum->info()->data_type()));
-    build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
-
-    // Create kernel
-    _kernel = create_kernel(compile_context, "gemm_accumulate_biases", build_opts.options());
-}
-
-Status CLGEMMMatrixAccumulateBiasesKernel::validate(const ITensorInfo *accum, const ITensorInfo *biases, GPUTarget gpu_target)
-{
-    unsigned int num_elems_processed_per_iteration = 0;
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(accum, biases));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(accum->clone().get(), biases->clone().get(), gpu_target, num_elems_processed_per_iteration).first);
-
-    return Status{};
-}
-
-void CLGEMMMatrixAccumulateBiasesKernel::run(const Window &window, cl::CommandQueue &queue)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
-    Window accum_slice = window.first_slice_window_2D();
-
-    Window biases_slice(accum_slice);
-    biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
-
-    // Run kernel
-    do
-    {
-        // Set arguments
-        unsigned int idx = 0;
-        add_2D_tensor_argument(idx, _accum, accum_slice);
-        add_1D_tensor_argument(idx, _biases, biases_slice);
-
-        enqueue(queue, *this, accum_slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(accum_slice));
-}
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index d2c7954..c2dd92c 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp
index d5a5284..da57aa4 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -69,6 +69,7 @@
                                     && (!gemm_info.broadcast_bias),
                                     "Bias addition only supported with broadcast mode in case the input or output has to be reinterpreted as 3D");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for GEMM native");
 
     const unsigned int m = gemm_info.m;
     const unsigned int n = gemm_info.n;
@@ -154,33 +155,26 @@
     num_elems_processed_per_iteration_x = rhs_info.n0;
     num_elems_processed_per_iteration_y = lhs_info.m0;
 
-    // Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor
-    // The only way to set properly the paddings, it is to set those explicitly through the AccessWindowStatic
-    const unsigned int m          = reinterpret_output_as_3d ? gemm_info.m : output->dimension(1);
-    const unsigned int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;
-
     win     = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
     win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
     AccessWindowStatic input0_access(input0, 0, 0,
                                      input0->dimension(0),
-                                     input0->dimension(1) + bottom_pad);
+                                     input0->dimension(1));
     AccessWindowStatic input1_access(input1, 0, 0,
                                      ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
                                      input1->dimension(1));
     AccessWindowStatic output_access(output, 0, 0,
-                                     ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
-                                     output->dimension(1) + bottom_pad);
+                                     output->dimension(0),
+                                     output->dimension(1));
 
     if(input2 != nullptr)
     {
         const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
 
-        const int bias_processed_per_iteration_y = gemm_info.broadcast_bias ? 1 : num_elems_processed_per_iteration_y;
-
         AccessWindowStatic input2_access(input2, 0, 0,
                                          ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x),
-                                         ceil_to_multiple(input2->dimension(1), bias_processed_per_iteration_y));
+                                         input2->dimension(1));
 
         window_changed = update_window_and_padding(win, input0_access, input1_access, input2_access) || // window used by the execute_window_loop
                          update_window_and_padding(win_out, output_access);                             // window used to update the padding requirements of output tensor
@@ -263,6 +257,14 @@
     const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(1) : input0->info()->dimension(1);
     const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(2) : input0->info()->dimension(2);
 
+    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
+    const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
+    const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
+
+    // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
+    // NOTE: This might have implications on heuristics and performance
+    const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
+
     // Create build options
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
@@ -279,9 +281,11 @@
     build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
     build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
     build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
-    build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
     build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
     build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
+    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
+    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
     build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
     build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
     build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
index 09e4e98..8f20de1 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,7 @@
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
@@ -38,6 +39,7 @@
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/helpers/float_ops.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CL/CLUtils.h"
 #include "support/StringSupport.h"
 
 #include <cstddef>
@@ -78,6 +80,7 @@
                                     && (!gemm_info.broadcast_bias),
                                     "Bias addition only supported with broadcast mode in case the input or output has to be reinterpreted as 3D");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (input0->data_type() == DataType::F32), "Mixed precision only supported for F16 data type");
+    ARM_COMPUTE_RETURN_ON_ERROR(cl_gemm::validate_image2d_support_on_rhs(*input1, rhs_info));
 
     const unsigned int m = gemm_info.m;
     const unsigned int n = gemm_info.n;
@@ -156,23 +159,18 @@
     num_elems_processed_per_iteration_x = rhs_info.n0;
     num_elems_processed_per_iteration_y = lhs_info.m0;
 
-    // Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor
-    // The only way to set properly the paddings, it is to set those explicitly through the AccessWindowStatic
-    const unsigned int m          = gemm_info.m;
-    const unsigned int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;
-
     win     = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
     win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
     AccessWindowStatic input0_access(input0, 0, 0,
-                                     ceil_to_multiple(input0->dimension(0), num_elems_processed_per_iteration_y),
+                                     input0->dimension(0),
                                      input0->dimension(1));
     AccessWindowStatic input1_access(input1, 0, 0,
-                                     ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
+                                     input1->dimension(0),
                                      input1->dimension(1));
     AccessWindowStatic output_access(output, 0, 0,
-                                     ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
-                                     output->dimension(1) + bottom_pad);
+                                     output->dimension(0),
+                                     output->dimension(1));
 
     if(input2 != nullptr)
     {
@@ -207,8 +205,8 @@
 } // namespace
 
 CLGEMMMatrixMultiplyReshapedKernel::CLGEMMMatrixMultiplyReshapedKernel()
-    : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _k(1), _use_dummy_work_items(false), _add_bias(false),
-      _broadcast_bias(false)
+    : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_output_as_3d(false), _use_dummy_work_items(false), _add_bias(false),
+      _broadcast_bias(false), _export_to_cl_image(false), _k(1)
 {
 }
 
@@ -233,10 +231,11 @@
     _input2                   = helpers::float_ops::is_zero(beta) ? nullptr : input2;
     _output                   = output;
     _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
-    _k                        = gemm_info.k;
     _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
     _add_bias                 = _input2 != nullptr;
     _broadcast_bias           = gemm_info.broadcast_bias;
+    _export_to_cl_image       = rhs_info.export_to_cl_image;
+    _k                        = gemm_info.k;
 
     // Check if we need to slide the matrix B
     const unsigned int num_dimensions_input0 = _input0->info()->num_dimensions();
@@ -252,6 +251,12 @@
     const bool     enable_mixed_precision = gemm_info.fp_mixed_precision;
     const DataType data_type              = input0->info()->data_type();
 
+    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
+    const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : output->info()->dimension(1);
+
+    const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
+    const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
+
     // Create build options
     CLBuildOptions build_opts;
     build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
@@ -270,19 +275,25 @@
     build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
     build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
     build_opts.add_option_if(enable_mixed_precision, "-DMIXED_PRECISION");
+    build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT");
+    build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(input1->info()->dimension(1)));
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
     build_opts.add_option("-DDATA_TYPE_ACCUMULATOR=" + (enable_mixed_precision ? get_cl_type_from_data_type(DataType::F32) : get_cl_type_from_data_type(data_type)));
     build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m));
     build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
+    build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
     build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
     build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
     build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
     build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
     build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
+    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
+    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
 
     std::string kernel_name("gemm_mm_reshaped_");
     kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_";
     kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt";
+    kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";
 
     // Create kernel
     _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
@@ -356,20 +367,16 @@
     slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
     slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
 
-    if(_reinterpret_output_as_3d)
+    const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
+
+    cl::Image2D input1_image2d;
+
+    if(_export_to_cl_image)
     {
-        // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
-        unsigned int idx0;
-        if(_add_bias)
-        {
-            idx0 = 4 * num_arguments_per_2D_tensor() + 5;
-        }
-        else
-        {
-            idx0 = 3 * num_arguments_per_2D_tensor() + 4;
-        }
-        const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
-        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+        const TensorShape shape2d(_input1->info()->dimension(0) / 4, _input1->info()->dimension(1) * _input1->info()->dimension(2));
+        const size_t      image_row_pitch = _input1->info()->strides_in_bytes()[1];
+
+        input1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), _input1->cl_buffer(), shape2d, CL_FLOAT, image_row_pitch);
     }
 
     do
@@ -383,18 +390,51 @@
         }
 
         unsigned int idx = 0;
+
+        // LHS buffer
         add_2D_tensor_argument(idx, _input0, slice);
-        add_2D_tensor_argument(idx, _input1, slice_b);
-        add_2D_tensor_argument_if((_add_bias), idx, _input2, slice);
+
+        // RHS buffer or RHS OpenCL image (_export_to_cl_image == true)
+        if(_export_to_cl_image)
+        {
+            _kernel.setArg(idx++, input1_image2d);
+        }
+        else
+        {
+            add_2D_tensor_argument(idx, _input1, slice_b);
+        }
+
+        // Bias buffer (_add_bias == true)
+        add_2D_tensor_argument_if(_add_bias, idx, _input2, slice);
+
+        // Output buffer
         add_2D_tensor_argument(idx, _output, slice);
+
+        // K dimension (not used if _export_to_cl_image == true)
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_k));
+
+        // LHS stride_z
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
+
+        // RHS stride_z (not used if _export_to_cl_image == true)
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
+
+        // Bias stride_z (if _add_bias == true)
         if(_add_bias)
         {
             _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input2->info()->strides_in_bytes()[2]));
         }
+
+        // Output stride_z
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
+
+        // Cross-plane padding (if _reinterpret_output_as_3d = true)
+        if(_reinterpret_output_as_3d)
+        {
+            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad));
+        }
+
+        // Dispatch kernel
         enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
     }
     while(window.slide_window_slice_3D(slice));
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
index 8e194d5..cf77c70 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,12 +26,14 @@
 #include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/helpers/float_ops.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CL/CLUtils.h"
 #include "support/StringSupport.h"
 
 #include <tuple>
@@ -64,6 +66,7 @@
                                     && (!gemm_info.broadcast_bias),
                                     "Bias addition only supported with broadcast mode in case the input or output has to be reinterpreted as 3D");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
+    ARM_COMPUTE_RETURN_ON_ERROR(cl_gemm::validate_image2d_support_on_rhs(*input1, rhs_info));
 
     const unsigned int m = gemm_info.m;
     const unsigned int n = gemm_info.n;
@@ -152,33 +155,26 @@
     num_elems_processed_per_iteration_x = rhs_info.n0;
     num_elems_processed_per_iteration_y = lhs_info.m0;
 
-    // Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor
-    // The only way to set properly the paddings, it is to set those explicitly through the AccessWindowStatic
-    const unsigned int m          = reinterpret_output_as_3d ? gemm_info.m : output->dimension(1);
-    const unsigned int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;
-
     win     = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
     win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
     AccessWindowStatic input0_access(input0, 0, 0,
                                      input0->dimension(0),
-                                     input0->dimension(1) + bottom_pad);
+                                     input0->dimension(1));
     AccessWindowStatic input1_access(input1, 0, 0,
                                      input1->dimension(0),
                                      input1->dimension(1));
     AccessWindowStatic output_access(output, 0, 0,
-                                     ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
-                                     output->dimension(1) + bottom_pad);
+                                     output->dimension(0),
+                                     output->dimension(1));
 
     if(input2 != nullptr)
     {
         const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
 
-        const int bias_processed_per_iteration_y = gemm_info.broadcast_bias ? 1 : num_elems_processed_per_iteration_y;
-
         AccessWindowStatic input2_access(input2, 0, 0,
                                          ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x),
-                                         ceil_to_multiple(input2->dimension(1), bias_processed_per_iteration_y));
+                                         input2->dimension(1));
 
         window_changed = update_window_and_padding(win, input0_access, input1_access, input2_access) || // window used by the execute_window_loop
                          update_window_and_padding(win_out, output_access);                             // window used to update the padding requirements of output tensor
@@ -204,7 +200,7 @@
 
 CLGEMMMatrixMultiplyReshapedOnlyRHSKernel::CLGEMMMatrixMultiplyReshapedOnlyRHSKernel()
     : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _use_dummy_work_items(false),
-      _add_bias(false), _broadcast_bias(false)
+      _add_bias(false), _broadcast_bias(false), _export_to_cl_image(false)
 {
 }
 
@@ -234,6 +230,7 @@
     _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
     _add_bias                 = _input2 != nullptr;
     _broadcast_bias           = gemm_info.broadcast_bias;
+    _export_to_cl_image       = rhs_info.export_to_cl_image;
 
     // In case both input and output have to be reinterpreted as 3D tensors,
     // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
@@ -262,6 +259,14 @@
     const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(1) : input0->info()->dimension(1);
     const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(2) : input0->info()->dimension(2);
 
+    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
+    const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
+    const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
+
+    // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
+    // NOTE: This might have implications on heuristics and performance
+    const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
+
     // Create build options
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
@@ -276,19 +281,24 @@
     build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
     build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
     build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+    build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT");
+    build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(input1->info()->dimension(1)));
     build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
     build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
     build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
-    build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
     build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
     build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
     build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
+    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
+    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
     build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
     build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
     build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
 
     std::string kernel_name("gemm_mm_reshaped_only_rhs_");
     kernel_name += rhs_info.transpose ? "t" : "nt";
+    kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";
 
     // Create kernel
     _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
@@ -358,36 +368,17 @@
     slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
     slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
 
-    if(_reinterpret_input_as_3d)
-    {
-        // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
-        unsigned int idx0;
-        if(_add_bias)
-        {
-            idx0 = 4 * num_arguments_per_2D_tensor() + 4;
-        }
-        else
-        {
-            idx0 = 3 * num_arguments_per_2D_tensor() + 3;
-        }
-        const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
-        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
-    }
+    const unsigned int total_cross_plane_pad_lhs = _input0->info()->padding().top + _input0->info()->padding().bottom;
+    const unsigned int total_cross_plane_pad_out = _output->info()->padding().top + _output->info()->padding().bottom;
 
-    if(_reinterpret_output_as_3d)
+    cl::Image2D input1_image2d;
+
+    if(_export_to_cl_image)
     {
-        // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
-        unsigned int idx0;
-        if(_add_bias)
-        {
-            idx0 = 4 * num_arguments_per_2D_tensor() + 4 + (_reinterpret_input_as_3d ? 1 : 0);
-        }
-        else
-        {
-            idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
-        }
-        const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
-        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+        const TensorShape shape2d(_input1->info()->dimension(0) / 4, _input1->info()->dimension(1) * _input1->info()->dimension(2));
+        const size_t      image_row_pitch = _input1->info()->strides_in_bytes()[1];
+
+        input1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), _input1->cl_buffer(), shape2d, CL_FLOAT, image_row_pitch);
     }
 
     do
@@ -401,17 +392,53 @@
         }
 
         unsigned int idx = 0;
+
+        // LHS buffer
         add_2D_tensor_argument(idx, _input0, slice);
-        add_2D_tensor_argument(idx, _input1, slice_b);
-        add_2D_tensor_argument_if((_add_bias), idx, _input2, slice);
+
+        // RHS buffer or RHS OpenCL image (_export_to_cl_image == true)
+        if(_export_to_cl_image)
+        {
+            _kernel.setArg(idx++, input1_image2d);
+        }
+        else
+        {
+            add_2D_tensor_argument(idx, _input1, slice_b);
+        }
+
+        // Bias buffer (_add_bias == true)
+        add_2D_tensor_argument_if(_add_bias, idx, _input2, slice);
+
+        // Output buffer
         add_2D_tensor_argument(idx, _output, slice);
+
+        // LHS stride_z
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
+
+        // RHS stride_z (not used if _export_to_cl_image == true)
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
+
+        // Bias stride_z (if _add_bias == true)
         if(_add_bias)
         {
             _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input2->info()->strides_in_bytes()[2]));
         }
+
+        // Output stride_z
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
+
+        // Cross-plane padding (if _reinterpret_input_as_3d = true)
+        if(_reinterpret_input_as_3d)
+        {
+            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad_lhs));
+        }
+
+        // Cross-plane padding (if _reinterpret_output_as_3d = true)
+        if(_reinterpret_output_as_3d)
+        {
+            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad_out));
+        }
+
         enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
     }
     while(window.slide_window_slice_3D(slice));
diff --git a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
index 4e57259..f523845 100644
--- a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,7 +44,7 @@
 Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input0);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
     ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input0->data_type()) && (output->data_type() != DataType::S32));
     ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(2) != input1->dimension(1));
diff --git a/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp b/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp
index 3267a0e..156a657 100644
--- a/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -88,17 +88,12 @@
     auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*input, lhs_info, reinterpret_input_as_3d)));
 
     // Configure window
-    // Note: bottom paddings are calculated manually as the input can be reinterpreted as 3D tensor
-    // The only way to set properly the paddings, it is to set those explicitly through the AccessWindowStatic
-    const int m          = reinterpret_input_as_3d ? input->tensor_shape()[1] * input->tensor_shape()[2] : input->tensor_shape()[1];
-    const int bottom_pad = ceil_to_multiple(m, num_elems_processed_per_iteration_y) - m;
-
     Window win    = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
     Window win_in = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
     AccessWindowStatic input_access(input, 0, 0,
-                                    ceil_to_multiple(input->dimension(0), num_elems_processed_per_iteration_x),
-                                    input->dimension(1) + bottom_pad);
+                                    input->dimension(0),
+                                    input->dimension(1));
     AccessWindowStatic output_access(output, 0, 0, output->dimension(0), output->dimension(1));
 
     window_changed = update_window_and_padding(win_in, input_access) || // window used by the execute_window_loop
@@ -135,17 +130,25 @@
     _output                  = output;
     _reinterpret_input_as_3d = reinterpret_input_as_3d;
 
+    const unsigned int src_w           = input->info()->dimension(0);
+    const unsigned int src_h           = _reinterpret_input_as_3d ? input->info()->dimension(1) * input->info()->dimension(2) : input->info()->dimension(1);
+    const unsigned int partial_load_m0 = src_h % lhs_info.m0;
+    const unsigned int partial_load_k0 = src_w % lhs_info.k0;
+
     // Create build options
     CLBuildOptions build_opts;
     build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
     build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
     build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
-    build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+    build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_w));
+    build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_h));
     build_opts.add_option_if(lhs_info.interleave, "-DINTERLEAVE");
     build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
     build_opts.add_option_if(_reinterpret_input_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(input->info()->dimension(1)));
     build_opts.add_option_if(_reinterpret_input_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(input->info()->dimension(2)));
     build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size()));
+    build_opts.add_option("-DPARTIAL_LOAD_M0=" + support::cpp11::to_string(partial_load_m0));
+    build_opts.add_option("-DPARTIAL_LOAD_K0=" + support::cpp11::to_string(partial_load_k0));
 
     std::string kernel_name("gemm_reshape_lhs_matrix_");
     kernel_name += lhs_info.transpose ? "t" : "nt";
diff --git a/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp b/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp
index 4217932..c1993b7 100644
--- a/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,7 @@
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
@@ -55,6 +56,12 @@
     ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16);
     ARM_COMPUTE_RETURN_ERROR_ON((rhs_info.k0 == 1) && (rhs_info.transpose));
 
+    if(rhs_info.export_to_cl_image)
+    {
+        const TensorInfo tensor_reshaped_info(compute_rhs_reshaped_shape(*input, rhs_info), 1, DataType::F32);
+        ARM_COMPUTE_RETURN_ON_ERROR(cl_gemm::validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info));
+    }
+
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
 
@@ -86,6 +93,11 @@
     window_changed = update_window_and_padding(win, input_access);
     output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
 
+    if(rhs_info.export_to_cl_image)
+    {
+        arm_compute::cl_gemm::update_padding_for_cl_image(output);
+    }
+
     // Collapse along the Z direction
     // This collapse needs to be here in order to tune the Z dimension of LWS
     Window collapsed = win.collapse(win, Window::DimZ);
diff --git a/src/core/CL/kernels/CLGatherKernel.cpp b/src/core/CL/kernels/CLGatherKernel.cpp
index 2cb8f23..57759fc 100644
--- a/src/core/CL/kernels/CLGatherKernel.cpp
+++ b/src/core/CL/kernels/CLGatherKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLGaussian3x3Kernel.cpp b/src/core/CL/kernels/CLGaussian3x3Kernel.cpp
index 210ffb9..08e7e27 100644
--- a/src/core/CL/kernels/CLGaussian3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLGaussian3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLGaussian5x5Kernel.cpp b/src/core/CL/kernels/CLGaussian5x5Kernel.cpp
index cb86467..5b3639f 100644
--- a/src/core/CL/kernels/CLGaussian5x5Kernel.cpp
+++ b/src/core/CL/kernels/CLGaussian5x5Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
index 73dbda2..0e20187 100644
--- a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
+++ b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
index 8baac18..3108ad8 100644
--- a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
+++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLHOGDescriptorKernel.cpp b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
index e58b62e..7f618b2 100644
--- a/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
+++ b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLHOGDetectorKernel.cpp b/src/core/CL/kernels/CLHOGDetectorKernel.cpp
index bee9744..fbd2208 100644
--- a/src/core/CL/kernels/CLHOGDetectorKernel.cpp
+++ b/src/core/CL/kernels/CLHOGDetectorKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLHarrisCornersKernel.cpp b/src/core/CL/kernels/CLHarrisCornersKernel.cpp
index 313d95f..08e670f 100644
--- a/src/core/CL/kernels/CLHarrisCornersKernel.cpp
+++ b/src/core/CL/kernels/CLHarrisCornersKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp
index 5c0eb2a..22b2cfc 100644
--- a/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLHeightConcatenateLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,30 +27,25 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include "support/StringSupport.h"
 
-#include <map>
-
 namespace arm_compute
 {
 namespace
 {
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int height_offset, ITensorInfo *output, unsigned int &num_elems_processed_per_iteration)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int &num_elems_processed_per_iteration)
 {
     num_elems_processed_per_iteration = 4;
     // The window needs to be based on input as we copy all the heights of input
     Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
     AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output, height_offset, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
     bool                   window_changed = update_window_and_padding(win, input_access, output_access);
 
     Window win_collapsed = win.collapse(win, Window::DimZ);
@@ -77,7 +72,7 @@
 } // namespace
 
 CLHeightConcatenateLayerKernel::CLHeightConcatenateLayerKernel()
-    : _input(nullptr), _output(nullptr), _height_offset(0), _num_elems_processed_per_iteration()
+    : _height_offset(0), _num_elems_processed_per_iteration()
 {
 }
 
@@ -85,37 +80,30 @@
 {
     unsigned int num_elems_processed_per_iteration;
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, height_offset, output));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), height_offset, output->clone().get(), num_elems_processed_per_iteration).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), num_elems_processed_per_iteration).first);
     return Status{};
 }
 
-void CLHeightConcatenateLayerKernel::configure(const ICLTensor *input, unsigned int height_offset, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, height_offset, output);
-}
-
-void CLHeightConcatenateLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int height_offset, ICLTensor *output)
+void CLHeightConcatenateLayerKernel::configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int height_offset, ITensorInfo *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), height_offset, output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, height_offset, output));
 
-    _input         = input;
-    _output        = output;
     _height_offset = height_offset;
 
-    auto win_config = validate_and_configure_window(input->info(), height_offset, output->info(), _num_elems_processed_per_iteration);
+    auto win_config = validate_and_configure_window(input, output, _num_elems_processed_per_iteration);
 
     // Add build options
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size()));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->element_size()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration));
     build_opts.add_option("-DHEIGHT_OFFSET=" + support::cpp11::to_string(_height_offset));
-    build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
+    build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->dimension(2)));
 
-    if(is_data_type_quantized_asymmetric(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info())
+    if(is_data_type_quantized_asymmetric(input->data_type()) && input->quantization_info() != output->quantization_info())
     {
-        const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
-        const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
+        const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info = output->quantization_info().uniform();
 
         build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset));
         build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
@@ -132,17 +120,20 @@
     ICLKernel::configure_internal(std::get<1>(win_config));
 
     // Set output valid region
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
 }
 
-void CLHeightConcatenateLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLHeightConcatenateLayerKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
+    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
     unsigned int idx = 0;
-    add_4D_tensor_argument(idx, _input, window);
-    add_4D_tensor_argument(idx, _output, window);
+    add_4D_tensor_argument(idx, src, window);
+    add_4D_tensor_argument(idx, dst, window);
     enqueue(queue, *this, window, lws_hint());
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLHistogramKernel.cpp b/src/core/CL/kernels/CLHistogramKernel.cpp
index f16fa8c..b8a4e86 100644
--- a/src/core/CL/kernels/CLHistogramKernel.cpp
+++ b/src/core/CL/kernels/CLHistogramKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 078aad2..c94e313 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -70,6 +70,13 @@
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::NHWC && num_groups > 1);
     ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(channel_idx) % num_groups) != 0);
 
+    // Since there's no implicit padding added, check the total input spatial dimensions (with conv paddings) are big enough for the kernel dimensions
+    const unsigned int width_idx    = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const unsigned int height_idx   = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+    const unsigned     total_width  = input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right();
+    const unsigned     total_height = input->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom();
+    ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height));
+
     if(output->total_size() > 0)
     {
         const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups));
@@ -106,12 +113,12 @@
         win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
 
         const int xin_start = 0;
-        const int xin_end   = input->dimension(0) < num_elems_processed_per_iteration ? ceil_to_multiple(input->dimension(0), num_elems_processed_per_iteration) : input->dimension(0);
+        const int xin_end   = input->dimension(0);
         const int yin_start = 0;
         const int yin_end   = input->dimension(1);
 
         const int xout_start = 0;
-        const int xout_end   = input->dimension(0) < num_elems_processed_per_iteration ? output->dimension(0) + (num_elems_processed_per_iteration - input->dimension(0)) : output->dimension(0);
+        const int xout_end   = output->dimension(0);
         const int yout_start = 0;
         const int yout_end   = output->dimension(1);
 
@@ -140,7 +147,6 @@
             win = calculate_max_window(*input, Steps());
         }
     }
-
     output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
     // set the Z dimension's step same size as the whole dimension so that one can't split across the Z dimension
     win.set_dimension_step(Window::DimZ, win[Window::DimZ].end() - win[Window::DimZ].start());
@@ -192,7 +198,7 @@
 
     if(data_layout == DataLayout::NHWC)
     {
-        num_elems_processed_per_iteration = 2;
+        num_elems_processed_per_iteration = std::min(2U, input_channel);
         is_padding_required_nchw          = false;
 
         // Only the 3x3 and 9x9 cases are optimized for NHWC
@@ -205,8 +211,14 @@
             kernel_name = "im2col9x9_";
         }
 
-        build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-        build_opts.add_option("-DLAST_ACCESSED=" + support::cpp11::to_string(std::max(static_cast<int>(input_channel - num_elems_processed_per_iteration), 0)));
+        // Get boundary vector (the first/last vector with potentially a partial vector size) size
+        // If input_channel is a multiple of num_elems_processed_per_iteration, the boundary vec size is the (full) vector size
+        // otherwise, the boundary vec size is the (partial) remainder vector size
+        const unsigned int vec_size          = num_elems_processed_per_iteration;
+        const unsigned int partial_vec_size  = input_channel % vec_size;
+        const unsigned int boundary_vec_size = vec_size - ((vec_size - partial_vec_size) % vec_size);
+        build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vec_size));
+        build_opts.add_option("-DBOUNDARY_VECTOR_SIZE=" + support::cpp11::to_string(boundary_vec_size));
     }
     else
     {
diff --git a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
index 0eb2c50..2ad5233 100644
--- a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLIntegralImageKernel.cpp b/src/core/CL/kernels/CLIntegralImageKernel.cpp
index 4c3445d..aff4bd9 100644
--- a/src/core/CL/kernels/CLIntegralImageKernel.cpp
+++ b/src/core/CL/kernels/CLIntegralImageKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
index e04950d..a68d8db 100644
--- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLLKTrackerKernel.cpp b/src/core/CL/kernels/CLLKTrackerKernel.cpp
index a2948d3..fae5fe2 100644
--- a/src/core/CL/kernels/CLLKTrackerKernel.cpp
+++ b/src/core/CL/kernels/CLLKTrackerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
index 04ad754..0da0d4c 100644
--- a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,22 +25,16 @@
 
 #include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
 
-#include <set>
-#include <sstream>
-#include <string>
-
-using namespace arm_compute;
-
+namespace arm_compute
+{
 CLLocallyConnectedMatrixMultiplyKernel::CLLocallyConnectedMatrixMultiplyKernel()
     : _input0(nullptr), _input1(nullptr), _output(nullptr)
 {
@@ -52,9 +46,7 @@
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input0);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
 
@@ -152,3 +144,4 @@
     }
     while(window.slide_window_slice_2D(slice));
 }
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp b/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp
index 88c1034..ef8ebd5 100644
--- a/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp
+++ b/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
new file mode 100644
index 0000000..08c7464
--- /dev/null
+++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLMaxUnpoolingLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+using namespace misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, indices);
+
+    int                 pool_stride_x   = 0;
+    int                 pool_stride_y   = 0;
+    PoolingType         pool_type       = pool_info.pool_type;
+    const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
+    std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
+    const int    pool_size_x = pool_info.pool_size.width;
+    const int    pool_size_y = pool_info.pool_size.height;
+    const Size2D pool_size(pool_size_x, pool_size_y);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+    }
+
+    return Status{};
+}
+} // namespace
+
+CLMaxUnpoolingLayerKernel::CLMaxUnpoolingLayerKernel()
+    : _input(nullptr), _output(nullptr), _indices(nullptr)
+{
+}
+
+void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, indices->info()));
+
+    _input   = input;
+    _output  = output;
+    _indices = indices;
+
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size()));
+    build_opts.add_option("-DWIDTH_DST=" + support::cpp11::to_string(output->info()->dimension(0)));
+    build_opts.add_option("-DHEIGHT_DST=" + support::cpp11::to_string(output->info()->dimension(1)));
+    build_opts.add_option("-DDEPTH_DST=" + support::cpp11::to_string(output->info()->dimension(2)));
+
+    const std::string kernel_name("max_unpooling_layer_2");
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    const TensorShape output_shape = compute_unpool_shape(*input->info(), pool_info);
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+
+    auto window = calculate_max_window(*input->info(), Steps());
+    ICLKernel::configure_internal(window);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(3));
+}
+
+Status CLMaxUnpoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, indices));
+    return Status{};
+}
+
+void CLMaxUnpoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice);
+        add_3D_tensor_argument(idx, _output, slice);
+        add_3D_tensor_argument(idx, _indices, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(window.slide_window_slice_3D(slice));
+}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLMeanStdDevKernel.cpp b/src/core/CL/kernels/CLMeanStdDevKernel.cpp
index de8b57e..33099c9 100644
--- a/src/core/CL/kernels/CLMeanStdDevKernel.cpp
+++ b/src/core/CL/kernels/CLMeanStdDevKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
index 4230570..5ecbb4b 100644
--- a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLMedian3x3Kernel.cpp b/src/core/CL/kernels/CLMedian3x3Kernel.cpp
index 3b1b6ad..5f8c9e5 100644
--- a/src/core/CL/kernels/CLMedian3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLMedian3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLMemsetKernel.cpp b/src/core/CL/kernels/CLMemsetKernel.cpp
index 992be0a..f591c2f 100644
--- a/src/core/CL/kernels/CLMemsetKernel.cpp
+++ b/src/core/CL/kernels/CLMemsetKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
index 2ff9196..5f0e48d 100644
--- a/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLMinMaxLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLMinMaxLocationKernel.cpp b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
index dfa0555..9bbda40 100644
--- a/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
+++ b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLNonLinearFilterKernel.cpp b/src/core/CL/kernels/CLNonLinearFilterKernel.cpp
index 5066c3b..16e5113 100644
--- a/src/core/CL/kernels/CLNonLinearFilterKernel.cpp
+++ b/src/core/CL/kernels/CLNonLinearFilterKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp
index 7de7735..958d94c 100644
--- a/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
index 7c8c232..7d8e5db 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
index 2ca7716..00bdac3 100644
--- a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLPadLayerKernel.cpp b/src/core/CL/kernels/CLPadLayerKernel.cpp
index 82508ec..c05df61 100644
--- a/src/core/CL/kernels/CLPadLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPadLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #include "arm_compute/core/CL/kernels/CLPadLayerKernel.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "support/StringSupport.h"
 
diff --git a/src/core/CL/kernels/CLPermuteKernel.cpp b/src/core/CL/kernels/CLPermuteKernel.cpp
index e657c4e..1636e5a 100644
--- a/src/core/CL/kernels/CLPermuteKernel.cpp
+++ b/src/core/CL/kernels/CLPermuteKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
index 585715a..229937e 100644
--- a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
+++ b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,7 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/Cast.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
@@ -142,21 +143,21 @@
 {
 }
 
-void CLPixelWiseMultiplicationKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
+void CLPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale,
                                                 ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
 }
 
-void CLPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
+void CLPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale,
                                                 ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(),
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1, input2, output,
                                                   scale, overflow_policy, rounding_policy, act_info));
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
+    auto win_config = validate_and_configure_window(input1, input2, output);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
 
     _input1 = input1;
@@ -179,14 +180,14 @@
 
     std::string acc_type;
     // Check if it has float inputs and output
-    if(is_data_type_float(input1->info()->data_type()) || is_data_type_float(input2->info()->data_type()))
+    if(is_data_type_float(input1->data_type()) || is_data_type_float(input2->data_type()))
     {
         scale_int = -1;
-        acc_type  = (input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32) ? "float" : "half";
+        acc_type  = (input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32) ? "float" : "half";
     }
     else
     {
-        if(input1->info()->element_size() == 2 || input2->info()->element_size() == 2)
+        if(input1->element_size() == 2 || input2->element_size() == 2)
         {
             // Use 32-bit accumulator for 16-bit input
             acc_type = "int";
@@ -198,26 +199,26 @@
         }
     }
 
-    const bool is_quantized = is_data_type_quantized(input1->info()->data_type());
+    const bool is_quantized = is_data_type_quantized(input1->data_type());
 
     // Set kernel build options
     std::string    kernel_name = "pixelwise_mul";
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
-    build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
-    build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+    build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->data_type()));
+    build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->data_type()));
+    build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->data_type()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    if(is_quantized && (output->info()->data_type() != DataType::S32))
+    if(is_quantized && (output->data_type() != DataType::S32))
     {
-        const UniformQuantizationInfo iq1_info = input1->info()->quantization_info().uniform();
-        const UniformQuantizationInfo iq2_info = input2->info()->quantization_info().uniform();
-        const UniformQuantizationInfo oq_info  = output->info()->quantization_info().uniform();
+        const UniformQuantizationInfo iq1_info = input1->quantization_info().uniform();
+        const UniformQuantizationInfo iq2_info = input2->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info  = output->quantization_info().uniform();
 
-        build_opts.add_option_if(is_data_type_quantized_asymmetric(input1->info()->data_type()),
+        build_opts.add_option_if(is_data_type_quantized_asymmetric(input1->data_type()),
                                  "-DOFFSET_IN1=" + support::cpp11::to_string(iq1_info.offset));
-        build_opts.add_option_if(is_data_type_quantized_asymmetric(input2->info()->data_type()),
+        build_opts.add_option_if(is_data_type_quantized_asymmetric(input2->data_type()),
                                  "-DOFFSET_IN2=" + support::cpp11::to_string(iq2_info.offset));
-        build_opts.add_option_if(is_data_type_quantized_asymmetric(output->info()->data_type()),
+        build_opts.add_option_if(is_data_type_quantized_asymmetric(output->data_type()),
                                  "-DOFFSET_OUT=" + support::cpp11::to_string(oq_info.offset));
         build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale));
         build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale));
@@ -227,7 +228,7 @@
     else
     {
         kernel_name += (scale_int >= 0) ? "_int" : "_float";
-        build_opts.add_option_if_else(overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->info()->data_type()), "-DWRAP", "-DSATURATE");
+        build_opts.add_option_if_else(overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->data_type()), "-DWRAP", "-DSATURATE");
         build_opts.add_option_if_else(rounding_policy == RoundingPolicy::TO_ZERO, "-DROUND=_rtz", "-DROUND=_rte");
         build_opts.add_option("-DACC_DATA_TYPE=" + acc_type);
         if(act_info.enabled())
@@ -266,14 +267,18 @@
     return Status{};
 }
 
-void CLPixelWiseMultiplicationKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const TensorShape &in_shape1 = _input1->info()->tensor_shape();
-    const TensorShape &in_shape2 = _input2->info()->tensor_shape();
-    const TensorShape &out_shape = _output->info()->tensor_shape();
+    const auto src_0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    auto       dst   = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    const TensorShape &in_shape1 = src_0->info()->tensor_shape();
+    const TensorShape &in_shape2 = src_1->info()->tensor_shape();
+    const TensorShape &out_shape = dst->info()->tensor_shape();
 
     bool can_collapse = true;
     if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
@@ -298,9 +303,9 @@
     do
     {
         unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input1, slice_input1);
-        add_3D_tensor_argument(idx, _input2, slice_input2);
-        add_3D_tensor_argument(idx, _output, slice);
+        add_3D_tensor_argument(idx, src_0, slice_input1);
+        add_3D_tensor_argument(idx, src_1, slice_input2);
+        add_3D_tensor_argument(idx, dst, slice);
         enqueue(queue, *this, slice, lws_hint());
 
         ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
@@ -311,7 +316,7 @@
 
 BorderSize CLPixelWiseMultiplicationKernel::border_size() const
 {
-    const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+    const unsigned int replicateSize = _output->dimension(0) - std::min(_input1->dimension(0), _input2->dimension(0));
     const unsigned int border        = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
     return BorderSize{ 0, border, 0, 0 };
 }
@@ -374,18 +379,18 @@
 {
 }
 
-void CLComplexPixelWiseMultiplicationKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLComplexPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
 }
 
-void CLComplexPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLComplexPixelWiseMultiplicationKernel::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(input1->info(), input2->info(), output->info(), act_info));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(input1, input2, output, act_info));
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window_complex(input1->info(), input2->info(), output->info());
+    auto win_config = validate_and_configure_window_complex(input1, input2, output);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
 
     _input1 = input1;
@@ -415,14 +420,18 @@
     return Status{};
 }
 
-void CLComplexPixelWiseMultiplicationKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLComplexPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const TensorShape &in_shape1 = _input1->info()->tensor_shape();
-    const TensorShape &in_shape2 = _input2->info()->tensor_shape();
-    const TensorShape &out_shape = _output->info()->tensor_shape();
+    const auto src_0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    auto       dst   = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    const TensorShape &in_shape1 = src_0->info()->tensor_shape();
+    const TensorShape &in_shape2 = src_1->info()->tensor_shape();
+    const TensorShape &out_shape = dst->info()->tensor_shape();
 
     bool can_collapse = true;
     if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
@@ -447,9 +456,9 @@
     do
     {
         unsigned int idx = 0;
-        add_3D_tensor_argument(idx, _input1, slice_input1);
-        add_3D_tensor_argument(idx, _input2, slice_input2);
-        add_3D_tensor_argument(idx, _output, slice);
+        add_3D_tensor_argument(idx, src_0, slice_input1);
+        add_3D_tensor_argument(idx, src_1, slice_input2);
+        add_3D_tensor_argument(idx, dst, slice);
         enqueue(queue, *this, slice, lws_hint());
 
         ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
@@ -460,7 +469,7 @@
 
 BorderSize CLComplexPixelWiseMultiplicationKernel::border_size() const
 {
-    const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+    const unsigned int replicateSize = _output->dimension(0) - std::min(_input1->dimension(0), _input2->dimension(0));
     const unsigned int border        = std::min<unsigned int>(num_elems_processed_per_iteration_complex - 1U, replicateSize);
     return BorderSize{ 0, border, 0, 0 };
 }
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index cf1d7dd..d60e196 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -60,13 +60,20 @@
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(indices, "Indices not supported in the CL backend.");
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(input->data_type()) && pool_info.pool_type == PoolingType::L2),
                                     "Unsupported combination of parameters!");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(input->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding()
                                     && (input->data_layout() == DataLayout::NHWC),
                                     "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types");
+    // Check indices
+    if(indices)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
+    }
 
     // Checks performed when output is configured
     if(output->total_size() != 0)
@@ -80,7 +87,7 @@
     return Status{};
 }
 
-std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info)
+std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
@@ -140,7 +147,19 @@
             AccessWindowRectangle input_access(input, -pool_pad_left, -pool_pad_top, num_elems_read_per_iteration, pool_size_y,
                                                pool_stride_x, pool_stride_y);
             AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-            window_changed = update_window_and_padding(win, input_access, output_access);
+
+            // Update indices window
+            if(indices)
+            {
+                AccessWindowHorizontal indices_access(indices, 0, num_elems_processed_per_iteration);
+                window_changed = update_window_and_padding(win, input_access, output_access, indices_access);
+                indices_access.set_valid_region(win, ValidRegion(Coordinates(), indices->tensor_shape()));
+            }
+            else
+            {
+                window_changed = update_window_and_padding(win, input_access, output_access);
+            }
+
             output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
             break;
         }
@@ -153,7 +172,19 @@
                                             0, -1,
                                             ceil_to_multiple(input->dimension(0), num_elems_processed_per_iteration), input->dimension(1));
             AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-            window_changed = update_window_and_padding(win, input_access, output_access);
+
+            // Update indices window
+            if(indices)
+            {
+                AccessWindowHorizontal indices_access(indices, 0, num_elems_processed_per_iteration);
+                window_changed = update_window_and_padding(win, input_access, output_access, indices_access);
+                indices_access.set_valid_region(win, ValidRegion(Coordinates(), indices->tensor_shape()));
+            }
+            else
+            {
+                window_changed = update_window_and_padding(win, input_access, output_access);
+            }
+
             output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
             break;
         }
@@ -207,8 +238,39 @@
 
     // Set build options
     CLBuildOptions build_opts;
+    const DataType data_type = input->info()->data_type();
 
-    if(is_data_type_quantized_asymmetric(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info())
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info, (indices ? indices->info() : nullptr));
+
+    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
+    ICLKernel::configure_internal(std::get<1>(win_config));
+
+    if(_data_layout == DataLayout::NCHW)
+    {
+        CLPoolingConfig pooling_config     = std::get<2>(win_config);
+        _num_elems_processed_per_iteration = pooling_config.first;
+        _border_size                       = pooling_config.second;
+    }
+    else
+    {
+        _border_size                       = BorderSize(1, 0, 0, 0);
+        _num_elems_processed_per_iteration = 8;
+    }
+
+    // Tensor paddings are used to calculate the indices for MAX pooling
+    if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && _indices && is_data_type_float(data_type))
+    {
+        build_opts.add_option("-DPAD_TENSOR_LEFT=" + support::cpp11::to_string(input->info()->padding().left));
+        build_opts.add_option("-DPAD_TENSOR_RIGHT=" + support::cpp11::to_string(input->info()->padding().right));
+        build_opts.add_option("-DPAD_TENSOR_TOP=" + support::cpp11::to_string(input->info()->padding().top));
+        build_opts.add_option("-DPAD_TENSOR_BOTTOM=" + support::cpp11::to_string(input->info()->padding().bottom));
+        build_opts.add_option("-DTENSOR_CHANNEL=" + support::cpp11::to_string(input->info()->dimension(idx_channel)));
+        build_opts.add_option("-DTENSOR_WIDTH=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
+        build_opts.add_option("-DTENSOR_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
+    }
+
+    if(is_data_type_quantized_asymmetric(data_type) && input->info()->quantization_info() != output->info()->quantization_info())
     {
         const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
         const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
@@ -223,8 +285,6 @@
     auto_init(input->info(), output->info(), pool_info);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr));
 
-    const DataType data_type = input->info()->data_type();
-
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
     build_opts.add_option("-DPOOL_" + string_from_pooling_type(pool_type));
     build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x));
@@ -282,6 +342,20 @@
                                           + support::cpp11::to_string(pool_size_x);
                 _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
             }
+            else if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && _indices && is_data_type_float(data_type))
+            {
+                // For max pooling with pool2x2, store indices which will be used in max unpooling
+                if(data_type == DataType::F32)
+                {
+                    std::string kernel_name = "pooling_layer_2_nchw_indices_fp32";
+                    _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
+                }
+                else if(data_type == DataType::F16)
+                {
+                    std::string kernel_name = "pooling_layer_2_nchw_indices_fp16";
+                    _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
+                }
+            }
             else // Run general case
             {
                 std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized_nchw" : "pooling_layer_MxN_nchw";
@@ -296,32 +370,33 @@
             build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
             build_opts.add_option_if(output->info()->tensor_shape().total_size_upper(3) > 1,
                                      "-DDST_DEPTH=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
-            std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized_nhwc" : "pooling_layer_MxN_nhwc";
-            _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
+            build_opts.add_option_if(output->info()->tensor_shape().total_size_upper(3) > 1,
+                                     "-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->tensor_shape().total_size_upper(3)));
+
+            if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && _indices && is_data_type_float(data_type))
+            {
+                if(data_type == DataType::F32)
+                {
+                    std::string kernel_name = "pooling_layer_2_nhwc_indices_fp32";
+                    _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
+                }
+                else if(data_type == DataType::F16)
+                {
+                    std::string kernel_name = "pooling_layer_2_nhwc_indices_fp16";
+                    _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
+                }
+            }
+            else
+            {
+                std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized_nhwc" : "pooling_layer_MxN_nhwc";
+                _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
+            }
             break;
         }
         default:
             ARM_COMPUTE_ERROR("Not implemented");
     }
 
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info);
-
-    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-    ICLKernel::configure_internal(std::get<1>(win_config));
-
-    if(_data_layout == DataLayout::NCHW)
-    {
-        CLPoolingConfig pooling_config     = std::get<2>(win_config);
-        _num_elems_processed_per_iteration = pooling_config.first;
-        _border_size                       = pooling_config.second;
-    }
-    else
-    {
-        _border_size                       = BorderSize(1, 0, 0, 0);
-        _num_elems_processed_per_iteration = 8;
-    }
-
     // Set config_id for enabling LWS tuning
     _config_id = "pooling_layer_";
     _config_id += lower_string(string_from_data_type(data_type));
@@ -377,6 +452,10 @@
                 unsigned int idx = 0;
                 add_3D_tensor_argument(idx, _input, in_slice);
                 add_3D_tensor_argument(idx, _output, slice);
+                if(_indices && is_data_type_float(_input->info()->data_type()) && (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2)))
+                {
+                    add_3D_tensor_argument(idx, _indices, slice);
+                }
                 enqueue(queue, *this, slice, lws_hint());
             }
             while(window_collapsed.slide_window_slice_3D(slice));
@@ -398,6 +477,10 @@
                 unsigned int idx = 0;
                 add_4D_tensor_argument(idx, _input, in_slice);
                 add_4D_tensor_argument(idx, _output, slice);
+                if(_indices && is_data_type_float(_input->info()->data_type()) && (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2)))
+                {
+                    add_4D_tensor_argument(idx, _indices, slice);
+                }
                 enqueue(queue, *this, slice, lws_hint());
             }
             while(window.slide_window_slice_4D(slice) && window.slide_window_slice_4D(in_slice));
diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
index 07f669a..3429ef7 100644
--- a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
index d9da3cb..2f676d3 100644
--- a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp b/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
index b4b2217..f6b0888 100644
--- a/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLQuantizationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
index de99223..3f2a904 100644
--- a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
index a5b80eb..c2ed326 100644
--- a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLRangeKernel.cpp b/src/core/CL/kernels/CLRangeKernel.cpp
index 1e97649..d46cdd7 100644
--- a/src/core/CL/kernels/CLRangeKernel.cpp
+++ b/src/core/CL/kernels/CLRangeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index 33e7144..0ba63cc 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLRemapKernel.cpp b/src/core/CL/kernels/CLRemapKernel.cpp
index dcc425b..fe8c81a 100644
--- a/src/core/CL/kernels/CLRemapKernel.cpp
+++ b/src/core/CL/kernels/CLRemapKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLReorgLayerKernel.cpp b/src/core/CL/kernels/CLReorgLayerKernel.cpp
index 065e25e..ab81a8f 100644
--- a/src/core/CL/kernels/CLReorgLayerKernel.cpp
+++ b/src/core/CL/kernels/CLReorgLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLReshapeLayerKernel.cpp b/src/core/CL/kernels/CLReshapeLayerKernel.cpp
index ce79248..3daf21a 100644
--- a/src/core/CL/kernels/CLReshapeLayerKernel.cpp
+++ b/src/core/CL/kernels/CLReshapeLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,12 +34,13 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/Cast.h"
 
 #include <string>
 
 /** [CLReshapeLayerKernel Kernel] **/
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
@@ -54,44 +55,30 @@
 
     return Status{};
 }
-
 } // namespace
 
-CLReshapeLayerKernel::CLReshapeLayerKernel()
-    : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLReshapeLayerKernel::configure(const ICLTensor *input, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output);
-}
-
-void CLReshapeLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+void CLReshapeLayerKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
-    _input  = input;
-    _output = output;
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output));
 
     // Create kernel
-    std::set<std::string> build_opts = { "-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size()) };
+    std::set<std::string> build_opts = { "-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->element_size()) };
     _kernel                          = create_kernel(compile_context, "reshape_layer", build_opts);
 
     // Add static arguments
     const cl_int2 input_shape =
     {
         {
-            static_cast<cl_int>(_input->info()->tensor_shape()[0]),
-            static_cast<cl_int>(_input->info()->tensor_shape()[1])
+            static_cast<cl_int>(input->tensor_shape()[0]),
+            static_cast<cl_int>(input->tensor_shape()[1])
         }
     };
     const cl_int2 output_shape =
     {
         {
-            static_cast<cl_int>(_output->info()->tensor_shape()[0]),
-            static_cast<cl_int>(_output->info()->tensor_shape()[1])
+            static_cast<cl_int>(output->tensor_shape()[0]),
+            static_cast<cl_int>(output->tensor_shape()[1])
         }
     };
     unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters
@@ -99,10 +86,10 @@
     _kernel.setArg<cl_int2>(idx++, output_shape);
 
     // Configure kernel window
-    Window win = calculate_max_window(*input->info());
+    Window win = calculate_max_window(*input);
 
     // Set the output valid region
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
     ICLKernel::configure_internal(win);
 }
 
@@ -113,7 +100,7 @@
     return Status{};
 }
 
-void CLReshapeLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLReshapeLayerKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
@@ -121,10 +108,14 @@
     Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
     Window slice            = window_collapsed.first_slice_window_3D();
 
+    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
     // Set inputs
     unsigned int idx = 0;
-    add_3D_tensor_argument(idx, _input, window_collapsed);
-    add_3D_tensor_argument(idx, _output, window_collapsed);
+    add_3D_tensor_argument(idx, src, window_collapsed);
+    add_3D_tensor_argument(idx, dst, window_collapsed);
     enqueue(queue, *this, slice, lws_hint());
 }
+} // namespace arm_compute
 /** [CLReshapeLayerKernel Kernel] **/
diff --git a/src/core/CL/kernels/CLReverseKernel.cpp b/src/core/CL/kernels/CLReverseKernel.cpp
index d2a3809..6546ced 100644
--- a/src/core/CL/kernels/CLReverseKernel.cpp
+++ b/src/core/CL/kernels/CLReverseKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
index f41664f..2e7ee36 100644
--- a/src/core/CL/kernels/CLScaleKernel.cpp
+++ b/src/core/CL/kernels/CLScaleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,6 +35,8 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "support/StringSupport.h"
 
+#include "src/core/utils/ScaleUtils.h"
+
 #include <set>
 #include <string>
 
@@ -54,13 +56,13 @@
     const unsigned int output_width  = output.dimension(idx_width);
     const unsigned int output_height = output.dimension(idx_height);
 
-    float wr = arm_compute::calculate_resize_ratio(input_width, output_width, align_corners);
-    float hr = arm_compute::calculate_resize_ratio(input_height, output_height, align_corners);
+    float wr = arm_compute::scale_utils::calculate_resize_ratio(input_width, output_width, align_corners);
+    float hr = arm_compute::scale_utils::calculate_resize_ratio(input_height, output_height, align_corners);
 
     return std::make_pair(wr, hr);
 }
 
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy, bool align_corners)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ScaleKernelInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
@@ -68,35 +70,20 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(output == input);
+    ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy));
 
-    if(align_corners)
-    {
-        // For bilinear method with aligned corners, the resize ratio will
-        // be calculated by (input_size - 1)/(output_size - 1). Belows are
-        // checking possible overflows.
-        const auto data_layout  = input->data_layout();
-        const auto width_index  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-        const auto height_index = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
-        const auto input_width   = input->dimension(width_index);
-        const auto input_height  = input->dimension(height_index);
-        const auto output_width  = output->dimension(width_index);
-        const auto output_height = output->dimension(height_index);
-
-        ARM_COMPUTE_RETURN_ERROR_ON(input_width == 0 || input_height == 0 || output_width == 0 || output_height == 0);
-        ARM_COMPUTE_RETURN_ERROR_ON((output_width - 1 == 0) || (output_height - 1 == 0));
-    }
+    const bool will_use_align_corners = info.align_corners;
 
     float wr = 0.f;
     float hr = 0.f;
-    std::tie(wr, hr) = calculate_scale_factors(*input, *output, align_corners);
+    std::tie(wr, hr) = calculate_scale_factors(*input, *output, will_use_align_corners);
 
-    ARM_COMPUTE_RETURN_ERROR_ON(policy == InterpolationPolicy::AREA && (wr > 1.f || hr > 1.f));
+    ARM_COMPUTE_RETURN_ERROR_ON(info.interpolation_policy == InterpolationPolicy::AREA && (wr > 1.f || hr > 1.f));
 
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy, BorderSize &border)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const ScaleKernelInfo &info, BorderSize &border)
 {
     Window       win{};
     bool         window_changed{};
@@ -107,7 +94,7 @@
     {
         case DataLayout::NCHW:
         {
-            if(border_mode == BorderMode::UNDEFINED)
+            if(info.border_mode == BorderMode::UNDEFINED)
             {
                 border = BorderSize(0);
             }
@@ -123,9 +110,9 @@
 
             output_access.set_valid_region(win, calculate_valid_region_scale(*(input),
                                                                              output->tensor_shape(),
-                                                                             policy,
-                                                                             sampling_policy,
-                                                                             border_mode == BorderMode::UNDEFINED));
+                                                                             info.interpolation_policy,
+                                                                             info.sampling_policy,
+                                                                             info.border_mode == BorderMode::UNDEFINED));
 
             window_changed = update_window_and_padding(win, input_access, output_access);
         }
@@ -157,15 +144,12 @@
     return BorderSize(1);
 }
 
-Status CLScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy,
-                               BorderMode border_mode, SamplingPolicy sampling_policy, bool align_corners)
+Status CLScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ScaleKernelInfo &info)
 {
-    BorderSize border           = BorderSize(1);
-    const bool is_align_corners = policy == InterpolationPolicy::BILINEAR
-                                  && sampling_policy == SamplingPolicy::TOP_LEFT
-                                  && align_corners;
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, policy, is_align_corners));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), policy, border_mode, sampling_policy, border).first);
+    BorderSize border = BorderSize(1);
+
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), info, border).first);
 
     return Status{};
 }
@@ -180,30 +164,26 @@
     return _output;
 }
 
-void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy, bool align_corners)
+void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, policy, border_mode, sampling_policy, align_corners);
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
 }
 
-void CLScaleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy,
-                              bool align_corners)
+void CLScaleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info)
 {
-    _align_corners = policy == InterpolationPolicy::BILINEAR
-                     && sampling_policy == SamplingPolicy::TOP_LEFT
-                     && align_corners;
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), info));
 
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), policy, _align_corners));
-
-    _input               = input;
-    _output              = output;
-    _interpolationPolicy = policy;
-    _data_layout         = input->info()->data_layout();
+    _input                = input;
+    _output               = output;
+    _interpolation_policy = info.interpolation_policy;
+    _data_layout          = input->info()->data_layout();
+    _align_corners        = info.align_corners;
 
     float wr = 0.f;
     float hr = 0.f;
     std::tie(wr, hr) = calculate_scale_factors(*input->info(), *output->info(), _align_corners);
 
-    const bool call_quantized_kernel = is_data_type_quantized_asymmetric(input->info()->data_type()) && policy == InterpolationPolicy::BILINEAR;
+    const bool call_quantized_kernel = is_data_type_quantized_asymmetric(input->info()->data_type()) && _interpolation_policy == InterpolationPolicy::BILINEAR;
 
     const int  idx_width  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
     const int  idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
@@ -212,14 +192,15 @@
     // Compute actual border size
     BorderSize border = border_size();
 
+    auto interpolation_policy_to_use = _interpolation_policy;
     // Area interpolation behaves as Nearest Neighbour in case of up-sampling
-    if(policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+    if(_interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
     {
-        policy = InterpolationPolicy::NEAREST_NEIGHBOR;
+        interpolation_policy_to_use = InterpolationPolicy::NEAREST_NEIGHBOR;
     }
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info(), policy, border_mode, sampling_policy, border);
+    auto win_config = validate_and_configure_window(input->info(), output->info(), info, border);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure_internal(win_config.second);
 
@@ -227,9 +208,10 @@
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.add_option("-DBORDER_SIZE=" + support::cpp11::to_string(border.right));
-    build_opts.add_option_if(border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
+    build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
     build_opts.add_option_if(is_nhwc, "-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
-    build_opts.add_option_if_else(sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT");
+    build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT");
+    build_opts.add_option_if(_align_corners, "-DALIGN_CORNERS");
     if(call_quantized_kernel)
     {
         const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform();
@@ -237,7 +219,7 @@
         build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qinfo.offset));
     }
 
-    std::string interpolation_name = string_from_interpolation_policy(policy);
+    std::string interpolation_name = string_from_interpolation_policy(interpolation_policy_to_use);
     std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
     std::string kernel_name = "scale_" + interpolation_name;
     kernel_name += call_quantized_kernel ? "_quantized_" : "_";
@@ -256,8 +238,8 @@
 
     // Set config_id for enabling LWS tuning
     _config_id = "scale_";
-    _config_id += (border_mode == BorderMode::REPLICATE ? "Bord_rep" : "");
-    _config_id += (sampling_policy == SamplingPolicy::CENTER ? "center" : "topleft");
+    _config_id += (info.border_mode == BorderMode::REPLICATE ? "Bord_rep" : "");
+    _config_id += (info.sampling_policy == SamplingPolicy::CENTER ? "center" : "topleft");
     _config_id += (is_nhwc ? "nhwc" : "nchw");
     _config_id += "_";
     _config_id += support::cpp11::to_string(output->info()->dimension(0));
diff --git a/src/core/CL/kernels/CLScharr3x3Kernel.cpp b/src/core/CL/kernels/CLScharr3x3Kernel.cpp
index cb65744..3172966 100644
--- a/src/core/CL/kernels/CLScharr3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLScharr3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLSelectKernel.cpp b/src/core/CL/kernels/CLSelectKernel.cpp
index 2789764..1244068 100644
--- a/src/core/CL/kernels/CLSelectKernel.cpp
+++ b/src/core/CL/kernels/CLSelectKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLSobel3x3Kernel.cpp b/src/core/CL/kernels/CLSobel3x3Kernel.cpp
index 12d04d9..86dcf22 100644
--- a/src/core/CL/kernels/CLSobel3x3Kernel.cpp
+++ b/src/core/CL/kernels/CLSobel3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLSobel5x5Kernel.cpp b/src/core/CL/kernels/CLSobel5x5Kernel.cpp
index a60bb0b..e010fdd 100644
--- a/src/core/CL/kernels/CLSobel5x5Kernel.cpp
+++ b/src/core/CL/kernels/CLSobel5x5Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLSobel7x7Kernel.cpp b/src/core/CL/kernels/CLSobel7x7Kernel.cpp
index a5fbe54..c2b4bec 100644
--- a/src/core/CL/kernels/CLSobel7x7Kernel.cpp
+++ b/src/core/CL/kernels/CLSobel7x7Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
index 09deb94..c7881b9 100644
--- a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -129,6 +129,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(sum, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
+    ARM_COMPUTE_RETURN_ERROR_ON(info.is_log && !is_data_type_float(info.input_data_type));
 
     // Note: output should always have a scale of 1/256 and offset 0
     const QuantizationInfo allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log);
diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
index 5900b08..3e0ac74 100644
--- a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
index 072e992..877d426 100644
--- a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLStackLayerKernel.cpp b/src/core/CL/kernels/CLStackLayerKernel.cpp
index 33797d7..c283c44 100644
--- a/src/core/CL/kernels/CLStackLayerKernel.cpp
+++ b/src/core/CL/kernels/CLStackLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,10 +27,7 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
@@ -98,7 +95,7 @@
 
     // Add build options
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.add_option("-DAXIS=" + support::cpp11::to_string(axis));
     build_opts.add_option("-DSRC_DIM2=" + support::cpp11::to_string(input->info()->dimension(2)));
     build_opts.add_option("-DDST_DIM3=" + support::cpp11::to_string(output->info()->dimension(3)));
diff --git a/src/core/CL/kernels/CLStridedSliceKernel.cpp b/src/core/CL/kernels/CLStridedSliceKernel.cpp
index 18f0227..f7b7290 100644
--- a/src/core/CL/kernels/CLStridedSliceKernel.cpp
+++ b/src/core/CL/kernels/CLStridedSliceKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/utils/helpers/bit_ops.h"
 #include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "support/StringSupport.h"
 
@@ -66,7 +67,7 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input, ITensorInfo *output,
                                                         const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
                                                         int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
 {
@@ -84,29 +85,14 @@
 }
 } // namespace
 
-CLStridedSliceKernel::CLStridedSliceKernel()
-    : _input(nullptr), _output(nullptr)
-{
-}
-
-void CLStridedSliceKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                     const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                                     int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
-}
-
-void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
+void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output,
                                      const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
                                      int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
 
-    _input  = input;
-    _output = output;
-
-    const TensorShape &input_shape = input->info()->tensor_shape();
+    const TensorShape &input_shape = input->tensor_shape();
 
     Coordinates starts_abs;
     Coordinates ends_abs;
@@ -117,12 +103,12 @@
                                                         begin_mask, end_mask, shrink_axis_mask);
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+    auto win_config = validate_and_configure_window(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
 
     // Enable multiple elements processing along x if stride_x is 1 and output width greater than the access vector size
-    const int  vec_size_x     = 16 / input->info()->element_size();
-    const int  output_width_x = output->info()->tensor_shape().x();
+    const int  vec_size_x     = 16 / input->element_size();
+    const int  output_width_x = output->tensor_shape().x();
     const bool is_shrink_on_x = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, 0);
     const bool multi_access_x = !is_shrink_on_x && (final_strides.x() == 1) && (output_width_x / vec_size_x > 0);
 
@@ -137,7 +123,7 @@
 
     // Create build options
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->data_type())));
     for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
     {
         const bool is_shrink = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, i);
@@ -150,8 +136,8 @@
     build_opts.add_option_if_else(input_shape.num_dimensions() > 2,
                                   "-DSRC_DEPTH=" + support::cpp11::to_string(input_shape.z()),
                                   "-DSRC_DEPTH=1");
-    build_opts.add_option_if_else(_output->info()->num_dimensions() > 2,
-                                  "-DDST_DEPTH=" + support::cpp11::to_string(_output->info()->tensor_shape().z()),
+    build_opts.add_option_if_else(output->num_dimensions() > 2,
+                                  "-DDST_DEPTH=" + support::cpp11::to_string(output->tensor_shape().z()),
                                   "-DDST_DEPTH=1");
 
     // Create kernel
@@ -160,11 +146,11 @@
     // Set config_id for enabling LWS tuning
     _config_id = "strided_slice";
     _config_id += "_";
-    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+    _config_id += lower_string(string_from_data_type(input->data_type()));
     for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
     {
         _config_id += "_";
-        _config_id += support::cpp11::to_string(input->info()->dimension(i));
+        _config_id += support::cpp11::to_string(input->dimension(i));
         _config_id += "_";
         _config_id += support::cpp11::to_string(starts_abs[i]);
         _config_id += "_";
@@ -186,19 +172,22 @@
     return Status{};
 }
 
-void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
+    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
     Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
     Window slice            = window_collapsed.first_slice_window_4D();
 
     do
     {
         unsigned int idx = 0;
-        add_4D_tensor_argument(idx, _input, slice);
-        add_4D_tensor_argument(idx, _output, slice);
+        add_4D_tensor_argument(idx, src, slice);
+        add_4D_tensor_argument(idx, dst, slice);
         enqueue(queue, *this, slice, lws_hint());
     }
     while(window_collapsed.slide_window_slice_4D(slice));
diff --git a/src/core/CL/kernels/CLTableLookupKernel.cpp b/src/core/CL/kernels/CLTableLookupKernel.cpp
index 07827d5..3b8ca60 100644
--- a/src/core/CL/kernels/CLTableLookupKernel.cpp
+++ b/src/core/CL/kernels/CLTableLookupKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLThresholdKernel.cpp b/src/core/CL/kernels/CLThresholdKernel.cpp
index 4f98463..de81644 100644
--- a/src/core/CL/kernels/CLThresholdKernel.cpp
+++ b/src/core/CL/kernels/CLThresholdKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,16 +32,14 @@
 
 #include <string>
 
-using namespace arm_compute;
-
-void CLThresholdKernel::configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold,
-                                  uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
+namespace arm_compute
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, threshold, false_value, true_value, type, upper);
+void CLThresholdKernel::configure(const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
 }
 
-void CLThresholdKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, uint8_t threshold,
-                                  uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
+void CLThresholdKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
@@ -49,7 +47,7 @@
     // Construct kernel name
     std::string kernel_name = "threshold";
 
-    switch(type)
+    switch(info.type)
     {
         case ThresholdType::BINARY:
             kernel_name += "_binary";
@@ -67,16 +65,17 @@
 
     // Set arguments
     unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
-    _kernel.setArg(idx++, false_value);
-    _kernel.setArg(idx++, true_value);
-    _kernel.setArg(idx++, threshold);
+    _kernel.setArg(idx++, info.false_value);
+    _kernel.setArg(idx++, info.true_value);
+    _kernel.setArg(idx++, info.threshold);
 
-    if(ThresholdType::RANGE == type)
+    if(ThresholdType::RANGE == info.type)
     {
-        _kernel.setArg(idx++, upper);
+        _kernel.setArg(idx++, info.upper);
     }
 
     // Make sure _kernel is initialized before calling the parent's configure
     constexpr unsigned int num_elems_processed_per_iteration = 16;
     ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
 }
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLTileKernel.cpp b/src/core/CL/kernels/CLTileKernel.cpp
index 2838251..bba1525 100644
--- a/src/core/CL/kernels/CLTileKernel.cpp
+++ b/src/core/CL/kernels/CLTileKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLTransposeKernel.cpp b/src/core/CL/kernels/CLTransposeKernel.cpp
index a28b685..a47d956 100644
--- a/src/core/CL/kernels/CLTransposeKernel.cpp
+++ b/src/core/CL/kernels/CLTransposeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLUpsampleLayerKernel.cpp b/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
index dd6f85f..1010550 100644
--- a/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
+++ b/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -52,14 +52,19 @@
     const int  idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
 
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_width) != info.x() * input->dimension(idx_width));
-    ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_height) != info.y() * input->dimension(idx_height));
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
+
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.x() != 2 || info.y() != 2, "Only stride 2 is supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(upsampling_policy != InterpolationPolicy::NEAREST_NEIGHBOR, "Only nearest neighbor policy supported");
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_width) != info.x() * input->dimension(idx_width));
+        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_height) != info.y() * input->dimension(idx_height));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+    }
 
     return Status{};
 }
@@ -126,7 +131,7 @@
 
     // Create kernel
     CLBuildOptions build_opts;
-    build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+    build_opts.add_option(("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(input->info()->element_size())));
     build_opts.add_option_if(multi_access_x, "-DVEC_SIZE_IN=" + support::cpp11::to_string(_num_elems_processed_per_iteration_input_x));
     build_opts.add_option_if(multi_access_x, "-DVEC_SIZE_OUT=" + support::cpp11::to_string(num_elems_processed_per_iteration_x));
     build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X_IN=" + support::cpp11::to_string(std::max<int>(_input->info()->dimension(0) - _num_elems_processed_per_iteration_input_x, 0)));
diff --git a/src/core/CL/kernels/CLWarpAffineKernel.cpp b/src/core/CL/kernels/CLWarpAffineKernel.cpp
index c40c614..e8da803 100644
--- a/src/core/CL/kernels/CLWarpAffineKernel.cpp
+++ b/src/core/CL/kernels/CLWarpAffineKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp b/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
index bc08549..dc7c359 100644
--- a/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
+++ b/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
index 873f3b3..267957e 100644
--- a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
+++ b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
index aba2af1..76100c2 100644
--- a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
+++ b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,14 +28,10 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/helpers/tensor_info.h"
+#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include "support/StringSupport.h"
@@ -51,8 +47,8 @@
     // The window needs to be based on the output
     Window             win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
     AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration), input1->dimension(1));
-    const unsigned int input2_right_padding = (output->dimension(0) / num_elems_processed_per_iteration) * num_elems_processed_per_iteration - input1->dimension(
-                                                  0) + num_elems_processed_per_iteration - input2->dimension(0);
+    const unsigned int input2_right_padding = ((output->dimension(0) / num_elems_processed_per_iteration) * num_elems_processed_per_iteration - input1->dimension(0) - input2->dimension(
+                                                   0)) % num_elems_processed_per_iteration;
     AccessWindowStatic input2_access(input2, -(input1->dimension(0) % num_elems_processed_per_iteration),
                                      0, input2->dimension(0) + input2_right_padding, input2->dimension(1));
     AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
@@ -82,11 +78,6 @@
 }
 } // namespace
 
-CLWidthConcatenate2TensorsKernel::CLWidthConcatenate2TensorsKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
 Status CLWidthConcatenate2TensorsKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output));
@@ -94,35 +85,26 @@
     return Status{};
 }
 
-void CLWidthConcatenate2TensorsKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
-}
-
-void CLWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+void CLWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info()));
-
-    _input1 = input1;
-    _input2 = input2;
-    _output = output;
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1, input2, output));
 
     // Add build options
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input1->info()->data_type()));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->data_type()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
-    build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->info()->dimension(0)));
-    build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->info()->element_size()));
+    build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input1->dimension(2)));
+    build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->dimension(0)));
+    build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->element_size()));
 
     // If input have different quantization info set quantization parameters needed for the re-quantization process
-    const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(output->info(), input1->info(), input2->info());
-    if(is_data_type_quantized_asymmetric(input1->info()->data_type()) && have_different_qinfo)
+    const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(output, input1, input2);
+    if(is_data_type_quantized_asymmetric(input1->data_type()) && have_different_qinfo)
     {
-        const UniformQuantizationInfo iq1_info = input1->info()->quantization_info().uniform();
-        const UniformQuantizationInfo iq2_info = input2->info()->quantization_info().uniform();
-        const UniformQuantizationInfo oq_info  = output->info()->quantization_info().uniform();
+        const UniformQuantizationInfo iq1_info = input1->quantization_info().uniform();
+        const UniformQuantizationInfo iq2_info = input2->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info  = output->quantization_info().uniform();
 
         build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq1_info.offset));
         build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale));
@@ -136,16 +118,16 @@
     _kernel = create_kernel(compile_context, "concatenate_width_x2", build_opts.options());
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
+    auto win_config = validate_and_configure_window(input1, input2, output);
     ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
 
     ICLKernel::configure_internal(std::get<1>(win_config));
 
     // Set output valid region
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
 
     // Pass paddings as arguments to the kernel
-    const unsigned int input1_width         = input1->info()->dimension(0);
+    const unsigned int input1_width         = input1->dimension(0);
     const unsigned int input1_right_padding = ceil_to_multiple(input1_width, num_elems_processed_per_iteration) - input1_width;
     const unsigned int input2_left_padding  = input1_width % num_elems_processed_per_iteration;
     unsigned int       idx0                 = 3 * num_arguments_per_4D_tensor();
@@ -154,30 +136,34 @@
 
     // Set config_id for enabling LWS tuning
     _config_id = "concatenate_width_x2_";
-    _config_id += lower_string(string_from_data_type(input1->info()->data_type()));
+    _config_id += lower_string(string_from_data_type(input1->data_type()));
     _config_id += "_";
-    _config_id += support::cpp11::to_string(input1->info()->dimension(0));
+    _config_id += support::cpp11::to_string(input1->dimension(0));
     _config_id += "_";
-    _config_id += support::cpp11::to_string(input1->info()->dimension(1));
+    _config_id += support::cpp11::to_string(input1->dimension(1));
     _config_id += "_";
-    _config_id += support::cpp11::to_string(input2->info()->dimension(0));
+    _config_id += support::cpp11::to_string(input2->dimension(0));
     _config_id += "_";
-    _config_id += support::cpp11::to_string(input2->info()->dimension(1));
+    _config_id += support::cpp11::to_string(input2->dimension(1));
 }
 
-void CLWidthConcatenate2TensorsKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLWidthConcatenate2TensorsKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
     Window slice = window.first_slice_window_4D();
 
+    const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC));
+    const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1));
+    auto       dst  = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
     do
     {
         unsigned int idx = 0;
-        add_4D_tensor_argument(idx, _input1, slice);
-        add_4D_tensor_argument(idx, _input2, slice);
-        add_4D_tensor_argument(idx, _output, slice);
+        add_4D_tensor_argument(idx, src0, slice);
+        add_4D_tensor_argument(idx, src1, slice);
+        add_4D_tensor_argument(idx, dst, slice);
         enqueue(queue, *this, window, lws_hint());
     }
     while(window.slide_window_slice_4D(slice));
diff --git a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
index e5eb8b3..0377eb7 100644
--- a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
+++ b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,14 +28,11 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/helpers/tensor_info.h"
+#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include "support/StringSupport.h"
@@ -101,7 +98,6 @@
 } // namespace
 
 CLWidthConcatenate4TensorsKernel::CLWidthConcatenate4TensorsKernel()
-    : _input1(nullptr), _input2(nullptr), _input3(nullptr), _input4(nullptr), _output(nullptr)
 {
 }
 
@@ -112,42 +108,33 @@
     return Status{};
 }
 
-void CLWidthConcatenate4TensorsKernel::configure(const ICLTensor *input1, const ICLTensor *input2, const ICLTensor *input3, const ICLTensor *input4, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, input3, input4, output);
-}
-
-void CLWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, const ICLTensor *input3, const ICLTensor *input4,
-                                                 ICLTensor *output)
+void CLWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile_context,
+                                                 ITensorInfo *input1, ITensorInfo *input2,
+                                                 ITensorInfo *input3, ITensorInfo *input4,
+                                                 ITensorInfo *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, input3, input4, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), input3->info(), input4->info(), output->info()));
-
-    _input1 = input1;
-    _input2 = input2;
-    _input3 = input3;
-    _input4 = input4;
-    _output = output;
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1, input2, input3, input4, output));
 
     // Add build options
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input1->info()->data_type()));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->data_type()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
-    build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->info()->dimension(0)));
-    build_opts.add_option("-DINPUT2_WIDTH=" + support::cpp11::to_string(input2->info()->dimension(0)));
-    build_opts.add_option("-DINPUT3_WIDTH=" + support::cpp11::to_string(input3->info()->dimension(0)));
-    build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->info()->element_size()));
+    build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input1->dimension(2)));
+    build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->dimension(0)));
+    build_opts.add_option("-DINPUT2_WIDTH=" + support::cpp11::to_string(input2->dimension(0)));
+    build_opts.add_option("-DINPUT3_WIDTH=" + support::cpp11::to_string(input3->dimension(0)));
+    build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->element_size()));
 
     // If input have different quantization info set quantization parameters needed for the re-quantization process
-    const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(output->info(), input1->info(), input2->info(), input3->info(), input4->info());
-    if(is_data_type_quantized_asymmetric(input1->info()->data_type()) && have_different_qinfo)
+    const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(output, input1, input2, input3, input4);
+    if(is_data_type_quantized_asymmetric(input1->data_type()) && have_different_qinfo)
     {
-        const UniformQuantizationInfo iq1_info = input1->info()->quantization_info().uniform();
-        const UniformQuantizationInfo iq2_info = input2->info()->quantization_info().uniform();
-        const UniformQuantizationInfo iq3_info = input3->info()->quantization_info().uniform();
-        const UniformQuantizationInfo iq4_info = input4->info()->quantization_info().uniform();
-        const UniformQuantizationInfo oq_info  = output->info()->quantization_info().uniform();
+        const UniformQuantizationInfo iq1_info = input1->quantization_info().uniform();
+        const UniformQuantizationInfo iq2_info = input2->quantization_info().uniform();
+        const UniformQuantizationInfo iq3_info = input3->quantization_info().uniform();
+        const UniformQuantizationInfo iq4_info = input4->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info  = output->quantization_info().uniform();
 
         build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq1_info.offset));
         build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale));
@@ -165,18 +152,18 @@
     _kernel = create_kernel(compile_context, "concatenate_width_x4", build_opts.options());
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input1->info(), input2->info(), input3->info(), input4->info(), output->info());
+    auto win_config = validate_and_configure_window(input1, input2, input3, input4, output);
     ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
 
     ICLKernel::configure_internal(std::get<1>(win_config));
 
     // Set output valid region
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
 
     // Pass paddings as arguments to the kernel
-    const unsigned int input1_width = input1->info()->dimension(0);
-    const unsigned int input2_width = input2->info()->dimension(0);
-    const unsigned int input3_width = input3->info()->dimension(0);
+    const unsigned int input1_width = input1->dimension(0);
+    const unsigned int input2_width = input2->dimension(0);
+    const unsigned int input3_width = input3->dimension(0);
 
     const unsigned int input1_right_padding = ceil_to_multiple(input1_width, num_elems_processed_per_iteration) - input1_width;
     const unsigned int input2_left_padding  = input1_width % num_elems_processed_per_iteration;
@@ -196,40 +183,46 @@
 
     // Set config_id for enabling LWS tuning
     _config_id = "concatenate_width_x4_";
-    _config_id += lower_string(string_from_data_type(input1->info()->data_type()));
+    _config_id += lower_string(string_from_data_type(input1->data_type()));
     _config_id += "_";
-    _config_id += support::cpp11::to_string(input1->info()->dimension(0));
+    _config_id += support::cpp11::to_string(input1->dimension(0));
     _config_id += "_";
-    _config_id += support::cpp11::to_string(input1->info()->dimension(1));
+    _config_id += support::cpp11::to_string(input1->dimension(1));
     _config_id += "_";
-    _config_id += support::cpp11::to_string(input2->info()->dimension(0));
+    _config_id += support::cpp11::to_string(input2->dimension(0));
     _config_id += "_";
-    _config_id += support::cpp11::to_string(input2->info()->dimension(1));
+    _config_id += support::cpp11::to_string(input2->dimension(1));
     _config_id += "_";
-    _config_id += support::cpp11::to_string(input3->info()->dimension(0));
+    _config_id += support::cpp11::to_string(input3->dimension(0));
     _config_id += "_";
-    _config_id += support::cpp11::to_string(input3->info()->dimension(1));
+    _config_id += support::cpp11::to_string(input3->dimension(1));
     _config_id += "_";
-    _config_id += support::cpp11::to_string(input4->info()->dimension(0));
+    _config_id += support::cpp11::to_string(input4->dimension(0));
     _config_id += "_";
-    _config_id += support::cpp11::to_string(input4->info()->dimension(1));
+    _config_id += support::cpp11::to_string(input4->dimension(1));
 }
 
-void CLWidthConcatenate4TensorsKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLWidthConcatenate4TensorsKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
+    const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC));
+    const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1));
+    const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 2));
+    const auto src3 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 3));
+    auto       dst  = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
     Window slice = window.first_slice_window_4D();
 
     do
     {
         unsigned int idx = 0;
-        add_4D_tensor_argument(idx, _input1, slice);
-        add_4D_tensor_argument(idx, _input2, slice);
-        add_4D_tensor_argument(idx, _input3, slice);
-        add_4D_tensor_argument(idx, _input4, slice);
-        add_4D_tensor_argument(idx, _output, slice);
+        add_4D_tensor_argument(idx, src0, slice);
+        add_4D_tensor_argument(idx, src1, slice);
+        add_4D_tensor_argument(idx, src2, slice);
+        add_4D_tensor_argument(idx, src3, slice);
+        add_4D_tensor_argument(idx, dst, slice);
         enqueue(queue, *this, window, lws_hint());
     }
     while(window.slide_window_slice_4D(slice));
diff --git a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
index 8eba293..d40597f 100644
--- a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
+++ b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,19 +27,14 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include "support/StringSupport.h"
 
-#include <map>
-
 namespace arm_compute
 {
 namespace
@@ -79,7 +74,7 @@
 } // namespace
 
 CLWidthConcatenateLayerKernel::CLWidthConcatenateLayerKernel()
-    : _input(nullptr), _output(nullptr), _width_offset(0)
+    : _width_offset(0)
 {
 }
 
@@ -90,31 +85,24 @@
     return Status{};
 }
 
-void CLWidthConcatenateLayerKernel::configure(const ICLTensor *input, unsigned int width_offset, ICLTensor *output)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), input, width_offset, output);
-}
-
-void CLWidthConcatenateLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int width_offset, ICLTensor *output)
+void CLWidthConcatenateLayerKernel::configure(const CLCompileContext &compile_context, ITensorInfo *input, unsigned int width_offset, ITensorInfo *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), width_offset, output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, width_offset, output));
 
-    _input        = input;
-    _output       = output;
     _width_offset = width_offset;
 
     // Add build options
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->data_type()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
     build_opts.add_option("-DWIDTH_OFFSET=" + support::cpp11::to_string(_width_offset));
-    build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2)));
+    build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->dimension(2)));
 
-    if(is_data_type_quantized_asymmetric(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info())
+    if(is_data_type_quantized_asymmetric(input->data_type()) && input->quantization_info() != output->quantization_info())
     {
-        const UniformQuantizationInfo iqinfo = input->info()->quantization_info().uniform();
-        const UniformQuantizationInfo oqinfo = output->info()->quantization_info().uniform();
+        const UniformQuantizationInfo iqinfo = input->quantization_info().uniform();
+        const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
 
         build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iqinfo.offset));
         build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oqinfo.offset));
@@ -125,23 +113,26 @@
     // Create kernel
     _kernel = create_kernel(compile_context, "concatenate_width", build_opts.options());
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), width_offset, output->info());
+    auto win_config = validate_and_configure_window(input, width_offset, output);
     ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
 
     ICLKernel::configure_internal(std::get<1>(win_config));
 
     // Set output valid region
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
 }
 
-void CLWidthConcatenateLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLWidthConcatenateLayerKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
+    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
     unsigned int idx = 0;
-    add_4D_tensor_argument(idx, _input, window);
-    add_4D_tensor_argument(idx, _output, window);
+    add_4D_tensor_argument(idx, src, window);
+    add_4D_tensor_argument(idx, dst, window);
     enqueue(queue, *this, window, lws_hint());
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
index 6ced0a1..4a1c48a 100644
--- a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
index 0915453..6b1b86a 100644
--- a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
index 96383ff..19f61b1 100644
--- a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/kernels/CLYOLOLayerKernel.cpp b/src/core/CL/kernels/CLYOLOLayerKernel.cpp
index 3a9f822..3dd9aa2 100644
--- a/src/core/CL/kernels/CLYOLOLayerKernel.cpp
+++ b/src/core/CL/kernels/CLYOLOLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CPP/CPPTypes.cpp b/src/core/CPP/CPPTypes.cpp
index e4c3b77..55119d8 100644
--- a/src/core/CPP/CPPTypes.cpp
+++ b/src/core/CPP/CPPTypes.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CPP/ICPPSimpleKernel.cpp b/src/core/CPP/ICPPSimpleKernel.cpp
index 01fb016..126bf54 100644
--- a/src/core/CPP/ICPPSimpleKernel.cpp
+++ b/src/core/CPP/ICPPSimpleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
index 3058a0c..917a6ad 100644
--- a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
+++ b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp b/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp
index 739f389..a0cfb3b 100644
--- a/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp
+++ b/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp b/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
index 5037ac5..ec03b72 100644
--- a/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
+++ b/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
index 7ea59ba..89e3058 100644
--- a/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
+++ b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CPP/kernels/CPPPermuteKernel.cpp b/src/core/CPP/kernels/CPPPermuteKernel.cpp
index 9d89836..1d1f0cd 100644
--- a/src/core/CPP/kernels/CPPPermuteKernel.cpp
+++ b/src/core/CPP/kernels/CPPPermuteKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp b/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp
index edc5e40..603b05e 100644
--- a/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp
+++ b/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CPP/kernels/CPPTopKVKernel.cpp b/src/core/CPP/kernels/CPPTopKVKernel.cpp
index 7f284d4..7ba8d7c 100644
--- a/src/core/CPP/kernels/CPPTopKVKernel.cpp
+++ b/src/core/CPP/kernels/CPPTopKVKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CPP/kernels/CPPUpsampleKernel.cpp b/src/core/CPP/kernels/CPPUpsampleKernel.cpp
index 8348b43..ff4ffb6 100644
--- a/src/core/CPP/kernels/CPPUpsampleKernel.cpp
+++ b/src/core/CPP/kernels/CPPUpsampleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/Error.cpp b/src/core/Error.cpp
index 8d321c0..5c8d45c 100644
--- a/src/core/Error.cpp
+++ b/src/core/Error.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/GCCoreRuntimeContext.cpp b/src/core/GLES_COMPUTE/GCCoreRuntimeContext.cpp
index 311dfd2..a374c59 100644
--- a/src/core/GLES_COMPUTE/GCCoreRuntimeContext.cpp
+++ b/src/core/GLES_COMPUTE/GCCoreRuntimeContext.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/GCHelpers.cpp b/src/core/GLES_COMPUTE/GCHelpers.cpp
index b974007..0c9ed82 100644
--- a/src/core/GLES_COMPUTE/GCHelpers.cpp
+++ b/src/core/GLES_COMPUTE/GCHelpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
index 4b3c5aa..5acebfb 100644
--- a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
+++ b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/IGCKernel.cpp b/src/core/GLES_COMPUTE/IGCKernel.cpp
index 4da35c7..eb96f91 100644
--- a/src/core/GLES_COMPUTE/IGCKernel.cpp
+++ b/src/core/GLES_COMPUTE/IGCKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/IGCSimple2DKernel.cpp b/src/core/GLES_COMPUTE/IGCSimple2DKernel.cpp
index 5bb479e..4c38412 100644
--- a/src/core/GLES_COMPUTE/IGCSimple2DKernel.cpp
+++ b/src/core/GLES_COMPUTE/IGCSimple2DKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/IGCSimple3DKernel.cpp b/src/core/GLES_COMPUTE/IGCSimple3DKernel.cpp
index 61225d8..df85285 100644
--- a/src/core/GLES_COMPUTE/IGCSimple3DKernel.cpp
+++ b/src/core/GLES_COMPUTE/IGCSimple3DKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/IGCSimpleKernel.cpp b/src/core/GLES_COMPUTE/IGCSimpleKernel.cpp
index 459601e..6609f45 100644
--- a/src/core/GLES_COMPUTE/IGCSimpleKernel.cpp
+++ b/src/core/GLES_COMPUTE/IGCSimpleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/IGCTensor.cpp b/src/core/GLES_COMPUTE/IGCTensor.cpp
index 19af777..0f310b8 100644
--- a/src/core/GLES_COMPUTE/IGCTensor.cpp
+++ b/src/core/GLES_COMPUTE/IGCTensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/OpenGLES.cpp b/src/core/GLES_COMPUTE/OpenGLES.cpp
index e93b360..f56bcfa 100644
--- a/src/core/GLES_COMPUTE/OpenGLES.cpp
+++ b/src/core/GLES_COMPUTE/OpenGLES.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/cs_shaders/absdiff.cs b/src/core/GLES_COMPUTE/cs_shaders/absdiff.cs
index d06de3a..c5196a1 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/absdiff.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/absdiff.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs
index e5411de..983b31d 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/cs_shaders/activation_layer_helpers_cs.h b/src/core/GLES_COMPUTE/cs_shaders/activation_layer_helpers_cs.h
index e0eacf8..e353b74 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/activation_layer_helpers_cs.h
+++ b/src/core/GLES_COMPUTE/cs_shaders/activation_layer_helpers_cs.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/cs_shaders/arithmetic_add.cs b/src/core/GLES_COMPUTE/cs_shaders/arithmetic_add.cs
index 2ab6d5e..faaf204 100755
--- a/src/core/GLES_COMPUTE/cs_shaders/arithmetic_add.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/arithmetic_add.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
index 81be967..f38a90b 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs b/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs
index 49b3954..d1d1a86 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
index 40b5a2b..d40cbbb 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/cs_shaders/depthwise_convolution3x3.cs b/src/core/GLES_COMPUTE/cs_shaders/depthwise_convolution3x3.cs
index 134cc10..3e7e1fd 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/depthwise_convolution3x3.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/depthwise_convolution3x3.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
index b42c09b..c455489 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs
index e51cc37..c9a2121 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
index 728e964..e47db54 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/cs_shaders/dropout.cs b/src/core/GLES_COMPUTE/cs_shaders/dropout.cs
index f4c8cb9..89ac8fe 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/dropout.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/dropout.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/cs_shaders/fill_border.cs b/src/core/GLES_COMPUTE/cs_shaders/fill_border.cs
index 4f87b92..4e96a5e 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/fill_border.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/fill_border.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
index e51908b..d41b48c 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h b/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h
index 014ff40..4e35517 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h
+++ b/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs
index f3cb52e..a5ec68c 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/cs_shaders/normalize_planar_yuv_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/normalize_planar_yuv_layer.cs
index 18a9af7..6a46845 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/normalize_planar_yuv_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/normalize_planar_yuv_layer.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/cs_shaders/pixelwise_mul_float.cs b/src/core/GLES_COMPUTE/cs_shaders/pixelwise_mul_float.cs
index 01e0f8a..936839f 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/pixelwise_mul_float.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/pixelwise_mul_float.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs
index aa639b2..6ca4265 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/cs_shaders/scale.cs b/src/core/GLES_COMPUTE/cs_shaders/scale.cs
index 5b2141f..63be478 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/scale.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/scale.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs
index 6967736..0293943 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/cs_shaders/tensor_shift.cs b/src/core/GLES_COMPUTE/cs_shaders/tensor_shift.cs
index a0af315..cd2dcde 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/tensor_shift.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/tensor_shift.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/cs_shaders/transpose.cs b/src/core/GLES_COMPUTE/cs_shaders/transpose.cs
index 89bf9fb..72ade20 100755
--- a/src/core/GLES_COMPUTE/cs_shaders/transpose.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/transpose.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017, 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/egl_entries.in b/src/core/GLES_COMPUTE/egl_entries.in
index 64ccda6..2fff315 100644
--- a/src/core/GLES_COMPUTE/egl_entries.in
+++ b/src/core/GLES_COMPUTE/egl_entries.in
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/gl_entries.in b/src/core/GLES_COMPUTE/gl_entries.in
index 17e3aee..80bdb91 100644
--- a/src/core/GLES_COMPUTE/gl_entries.in
+++ b/src/core/GLES_COMPUTE/gl_entries.in
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp
index e7ff136..f0a5003 100644
--- a/src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp
index 5aad807..1c02f41 100644
--- a/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp
index 0a5fe11..06c3486 100644
--- a/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
index a1f7cd7..3bd34ac 100644
--- a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp
index 1e48dc8..4fe6484 100644
--- a/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
index c6345ba..458cb63 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp
index c60f468..cb70dae 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
index f3e47d9..302b21b 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.cpp
index 9368770..5c6722a 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp
index d424f0d..3b3118b 100644
--- a/src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp
index 28be710..e0f7e95 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp
index f4c84f3..c9eb433 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp
index 0429824..e8298bc 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
index 2a85e0d..dd03faf 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp
index 6ebd8dd..4190163 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
index 1890cf7..64f2d63 100644
--- a/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp
index 094d895..5fa1987 100644
--- a/src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp
index ff885da..6a79990 100644
--- a/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp
index 69c97a8..45aa06c 100644
--- a/src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
index 36499eb..a592c09 100644
--- a/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp
index a85a0e7..cf10b92 100644
--- a/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,13 +45,13 @@
     return BorderSize(1);
 }
 
-void GCScaleKernel::configure(const IGCTensor *input, IGCTensor *output, InterpolationPolicy policy, bool border_undefined, SamplingPolicy sampling_policy)
+void GCScaleKernel::configure(const IGCTensor *input, IGCTensor *output, const ScaleKernelInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16);
     ARM_COMPUTE_ERROR_ON_NULLPTR(output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_ERROR_ON(output == input);
-    ARM_COMPUTE_ERROR_ON(policy != InterpolationPolicy::NEAREST_NEIGHBOR);
+    ARM_COMPUTE_ERROR_ON(info.interpolation_policy != InterpolationPolicy::NEAREST_NEIGHBOR);
 
     _input  = input;
     _output = output;
@@ -61,16 +61,18 @@
     const auto hr = static_cast<float>(input->info()->dimension(1)) / static_cast<float>(output->info()->dimension(1));
 
     // Compute actual border size
-    BorderSize border = border_undefined ? BorderSize(0) : border_size();
+    const bool border_undefined = info.border_mode == BorderMode::UNDEFINED;
+    BorderSize border           = border_undefined ? BorderSize(0) : border_size();
 
     // Area interpolation behaves as Nearest Neighbour in case of up-sampling
-    if(policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+    auto interpolation_policy_to_use = info.interpolation_policy;
+    if(interpolation_policy_to_use == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
     {
-        policy = InterpolationPolicy::NEAREST_NEIGHBOR;
+        interpolation_policy_to_use = InterpolationPolicy::NEAREST_NEIGHBOR;
     }
     else
     {
-        ARM_COMPUTE_ERROR_ON(policy == InterpolationPolicy::AREA);
+        ARM_COMPUTE_ERROR_ON(interpolation_policy_to_use == InterpolationPolicy::AREA);
     }
 
     // Create kernel
@@ -81,7 +83,7 @@
 
     build_opts.emplace("#define DATA_TYPE_FP16");
     build_opts.emplace("#define BORDER_SIZE " + support::cpp11::to_string(border.right));
-    if(sampling_policy == SamplingPolicy::TOP_LEFT)
+    if(info.sampling_policy == SamplingPolicy::TOP_LEFT)
     {
         build_opts.emplace("#define SAMPLING_POLICY_TOP_LEFT");
     }
@@ -106,7 +108,7 @@
         build_opts.emplace("#define SCALE_NEAREST_GENERIC");
     }
 
-    std::string interpolation_name = string_from_interpolation_policy(policy); // NOLINT
+    std::string interpolation_name = string_from_interpolation_policy(interpolation_policy_to_use); // NOLINT
     std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
     std::string kernel_name = "scale_" + interpolation_name;
     _kernel                 = GCKernelLibrary::get().create_kernel(kernel_name, build_opts);
@@ -130,8 +132,8 @@
 
     output_access.set_valid_region(win, calculate_valid_region_scale(*(input->info()),
                                                                      output->info()->tensor_shape(),
-                                                                     policy,
-                                                                     sampling_policy,
+                                                                     info.interpolation_policy,
+                                                                     info.sampling_policy,
                                                                      border_undefined));
 
     IGCKernel::configure(win);
diff --git a/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp
index f250801..f4ed961 100644
--- a/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp
index 16dafaf..d06be9b 100644
--- a/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
index ead50ce..66b4a55 100644
--- a/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp
index 07c09fa..9a430b4 100644
--- a/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/GPUTarget.cpp b/src/core/GPUTarget.cpp
index c50e7a1..7e4ef2f 100644
--- a/src/core/GPUTarget.cpp
+++ b/src/core/GPUTarget.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/HOGInfo.cpp b/src/core/HOGInfo.cpp
index bfee12c..1c27585 100644
--- a/src/core/HOGInfo.cpp
+++ b/src/core/HOGInfo.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/Helpers.cpp b/src/core/Helpers.cpp
index c0af3bb..bfc4a8d 100644
--- a/src/core/Helpers.cpp
+++ b/src/core/Helpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/IAccessWindow.cpp b/src/core/IAccessWindow.cpp
index be65102..8328012 100644
--- a/src/core/IAccessWindow.cpp
+++ b/src/core/IAccessWindow.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/IDistribution.cpp b/src/core/IDistribution.cpp
index 7d71869..c7e7346 100644
--- a/src/core/IDistribution.cpp
+++ b/src/core/IDistribution.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/IDistribution1D.cpp b/src/core/IDistribution1D.cpp
index f304289..4dbb081 100644
--- a/src/core/IDistribution1D.cpp
+++ b/src/core/IDistribution1D.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/IKernel.cpp b/src/core/IKernel.cpp
index 6450a4f..287cd04 100644
--- a/src/core/IKernel.cpp
+++ b/src/core/IKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/ITensor.cpp b/src/core/ITensor.cpp
index 607f5ce..e263596 100644
--- a/src/core/ITensor.cpp
+++ b/src/core/ITensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,8 +30,8 @@
 #include <cstring>
 #include <limits>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void ITensor::copy_from(const ITensor &src)
 {
     if(&src == this)
@@ -64,7 +64,8 @@
 
     const size_t line_size = src_info->element_size() * src_info->dimension(0);
 
-    execute_window_loop(win_src, [&](const Coordinates &)
+    execute_window_loop(
+        win_src, [&](const Coordinates &)
     {
         memcpy(dst_it.ptr(), src_it.ptr(), line_size);
     },
@@ -168,3 +169,4 @@
 {
     _is_used = false;
 }
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/ITensorPack.cpp b/src/core/ITensorPack.cpp
new file mode 100644
index 0000000..7a54a8b
--- /dev/null
+++ b/src/core/ITensorPack.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/ITensorPack.h"
+
+#include "arm_compute/core/ITensor.h"
+
+namespace arm_compute
+{
+void ITensorPack::add_tensor(int id, ITensor *tensor)
+{
+    _pack[id] = PackElement(tensor);
+}
+
+void ITensorPack::add_tensor(int id, const ITensor *tensor)
+{
+    _pack[id] = PackElement(tensor);
+}
+
+const ITensor *ITensorPack::get_const_tensor(int id) const
+{
+    auto it = _pack.find(id);
+    if(it != _pack.end())
+    {
+        return it->second.ctensor != nullptr ? it->second.ctensor : it->second.tensor;
+    }
+    return nullptr;
+}
+
+ITensor *ITensorPack::get_tensor(int id)
+{
+    auto it = _pack.find(id);
+    return it != _pack.end() ? it->second.tensor : nullptr;
+}
+
+size_t ITensorPack::size() const
+{
+    return _pack.size();
+}
+
+bool ITensorPack::empty() const
+{
+    return _pack.empty();
+}
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/MultiImageInfo.cpp b/src/core/MultiImageInfo.cpp
index 1e40a77..9ec1e1d 100644
--- a/src/core/MultiImageInfo.cpp
+++ b/src/core/MultiImageInfo.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/NETracePoint.cpp b/src/core/NEON/NETracePoint.cpp
index 298c6f4..cb0dc14 100644
--- a/src/core/NEON/NETracePoint.cpp
+++ b/src/core/NEON/NETracePoint.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
index 62285e0..3d4800f 100644
--- a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
+++ b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,15 +32,9 @@
 #include "arm_compute/core/Validate.h"
 
 #include <arm_neon.h>
-#include <cstdint>
-
-using namespace arm_compute;
 
 namespace arm_compute
 {
-class Coordinates;
-} // namespace arm_compute
-
 namespace
 {
 void abs_diff_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
@@ -140,7 +134,7 @@
     {
         set_format_if_unknown(*output->info(), Format::S16);
     }
-    else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
+    else if(input1->info()->data_type() == DataType::U8 || input2->info()->data_type() == DataType::U8)
     {
         set_format_if_unknown(*output->info(), Format::U8);
     }
@@ -210,3 +204,4 @@
 
     _func(_input1, _input2, _output, window);
 }
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEAccumulateKernel.cpp b/src/core/NEON/kernels/NEAccumulateKernel.cpp
index d601adc..7c85f69 100644
--- a/src/core/NEON/kernels/NEAccumulateKernel.cpp
+++ b/src/core/NEON/kernels/NEAccumulateKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index 8e91e6b..b15df31 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,28 +27,23 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/NEON/NEMath.h"
 #include "arm_compute/core/NEON/NESymm.h"
 #include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 
 #include <arm_neon.h>
-#include <array>
-#include <cmath>
-#include <map>
 #include <set>
 
-using namespace arm_compute;
+namespace arm_compute
+{
 namespace
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::QSYMM16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::QSYMM16, DataType::F16, DataType::F32);
 
     const static std::set<ActivationLayerInfo::ActivationFunction> qasymm8_supported_activations =
     {
@@ -95,7 +90,7 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input, ITensorInfo *output)
 {
     // Configure kernel window
     Window win = calculate_max_window(*input, Steps());
@@ -116,23 +111,15 @@
 } // namespace
 
 NEActivationLayerKernel::NEActivationLayerKernel()
-    : _input(nullptr), _output(nullptr), _func(nullptr), _act_info()
+    : _func(nullptr), _act_info()
 {
 }
 
-void NEActivationLayerKernel::configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info)
+void NEActivationLayerKernel::configure(const ITensorInfo *input, ITensorInfo *output, ActivationLayerInfo activation_info)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    _input    = input;
     _act_info = activation_info;
-    _output   = input;
-
-    // Out-of-place calculation
-    if(output != nullptr)
-    {
-        _output = output;
-    }
 
     // Disabled activation, thus no operation needed
     if(!activation_info.enabled())
@@ -140,7 +127,7 @@
         _func = nullptr;
     }
 
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, activation_info));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output, activation_info));
 
     // Activation functions : FP32
     static std::map<ActivationFunction, ActivationFunctionExecutorPtr> act_map_f32 =
@@ -218,7 +205,7 @@
 
     };
 
-    switch(input->info()->data_type())
+    switch(input->data_type())
     {
         case DataType::QASYMM8_SIGNED:
             _func = act_map_qasymm8_signed[activation_info.activation()];
@@ -242,14 +229,14 @@
     }
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), (output != nullptr) ? output->info() : nullptr);
+    auto win_config = validate_and_configure_window(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICPPKernel::configure(win_config.second);
 }
 
 template <ActivationLayerInfo::ActivationFunction F, typename T>
 typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
-NEActivationLayerKernel::activation(const Window &window)
+NEActivationLayerKernel::activation(const ITensor *src, ITensor *dst, const Window &window)
 {
     /** NEON vector tag type. */
     using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
@@ -262,11 +249,16 @@
     Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
     win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-    Iterator input(_input, win_collapsed);
-    Iterator output(_output, win_collapsed);
+    Iterator input(src, win_collapsed);
+    Iterator output(dst, win_collapsed);
 
-    const auto epsilon     = wrapper::vdup_n(static_cast<T>(1e-24), ExactTagType{});
-    const auto const_1     = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{});
+    // A small delta added to the input to prevent NAN values caused by zeros in inputs to SQRT
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    const auto delta = wrapper::vdup_n(static_cast<T>(1e-7), ExactTagType {});
+#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+    const auto delta = wrapper::vdup_n(static_cast<T>(1e-24), ExactTagType {});
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+    const auto const_1     = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType {});
     const auto const_0     = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
     const auto const_6     = wrapper::vdup_n(static_cast<T>(6.f), ExactTagType{});
     const auto const_3     = wrapper::vdup_n(static_cast<T>(3.f), ExactTagType{});
@@ -318,7 +310,7 @@
                     tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1)));
                     break;
                 case ActivationFunction::SQRT:
-                    tmp = wrapper::vinv(wrapper::vinvsqrt(vin + epsilon));
+                    tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, delta)));
                     break;
                 case ActivationFunction::SQUARE:
                     tmp = wrapper::vmul(vin, vin);
@@ -397,7 +389,7 @@
 }
 
 template <ActivationLayerInfo::ActivationFunction F, typename T>
-typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
+typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type NEActivationLayerKernel::activation(const ITensor *src, ITensor *dst, const Window &window)
 {
     const int                window_step_x  = 16 / sizeof(T);
     const auto               window_start_x = static_cast<int>(window.x().start());
@@ -407,11 +399,11 @@
     Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
     win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-    Iterator input(_input, win_collapsed);
-    Iterator output(_output, win_collapsed);
+    Iterator input(src, win_collapsed);
+    Iterator output(dst, win_collapsed);
 
-    const UniformQuantizationInfo qi_in           = _input->info()->quantization_info().uniform();
-    const UniformQuantizationInfo qi_out          = _output->info()->quantization_info().uniform();
+    const UniformQuantizationInfo qi_in           = src->info()->quantization_info().uniform();
+    const UniformQuantizationInfo qi_out          = dst->info()->quantization_info().uniform();
     const qasymm8x16_t            va              = vdupq_n_u8(quantize_qasymm8(_act_info.a(), qi_in));
     const qasymm8x16_t            vb              = vdupq_n_u8(quantize_qasymm8(_act_info.b(), qi_in));
     const qasymm8_t               a               = quantize_qasymm8(_act_info.a(), qi_in);
@@ -574,7 +566,7 @@
 }
 
 template <ActivationLayerInfo::ActivationFunction F, typename T>
-typename std::enable_if<std::is_same<T, qasymm8_signed_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
+typename std::enable_if<std::is_same<T, qasymm8_signed_t>::value, void>::type NEActivationLayerKernel::activation(const ITensor *src, ITensor *dst, const Window &window)
 {
     const int                window_step_x  = 16 / sizeof(T);
     const auto               window_start_x = static_cast<int>(window.x().start());
@@ -584,11 +576,11 @@
     Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
     win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-    Iterator input(_input, win_collapsed);
-    Iterator output(_output, win_collapsed);
+    Iterator input(src, win_collapsed);
+    Iterator output(dst, win_collapsed);
 
-    const UniformQuantizationInfo qi_in           = _input->info()->quantization_info().uniform();
-    const UniformQuantizationInfo qi_out          = _output->info()->quantization_info().uniform();
+    const UniformQuantizationInfo qi_in           = src->info()->quantization_info().uniform();
+    const UniformQuantizationInfo qi_out          = dst->info()->quantization_info().uniform();
     const qasymm8x16_signed_t     va              = vdupq_n_s8(quantize_qasymm8_signed(_act_info.a(), qi_in));
     const qasymm8x16_signed_t     vb              = vdupq_n_s8(quantize_qasymm8_signed(_act_info.b(), qi_in));
     const qasymm8_signed_t        a               = quantize_qasymm8_signed(_act_info.a(), qi_in);
@@ -751,7 +743,7 @@
 }
 
 template <ActivationLayerInfo::ActivationFunction F, typename T>
-typename std::enable_if<std::is_same<T, qsymm16_t>::value, void>::type NEActivationLayerKernel::activation(const Window &window)
+typename std::enable_if<std::is_same<T, qsymm16_t>::value, void>::type NEActivationLayerKernel::activation(const ITensor *src, ITensor *dst, const Window &window)
 {
     const int                window_step_x  = 16 / sizeof(T);
     const auto               window_start_x = static_cast<int>(window.x().start());
@@ -761,11 +753,11 @@
     Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
     win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-    Iterator input(_input, win_collapsed);
-    Iterator output(_output, win_collapsed);
+    Iterator input(src, win_collapsed);
+    Iterator output(dst, win_collapsed);
 
-    const UniformQuantizationInfo qi_in    = _input->info()->quantization_info().uniform();
-    const UniformQuantizationInfo qi_out   = _output->info()->quantization_info().uniform();
+    const UniformQuantizationInfo qi_in    = src->info()->quantization_info().uniform();
+    const UniformQuantizationInfo qi_out   = dst->info()->quantization_info().uniform();
     const auto                    vconst_1 = vdupq_n_f32(1.f);
     const float32x4_t             va_f32   = vdupq_n_f32(_act_info.a());
     const float32x4_t             vb_f32   = vdupq_n_f32(_act_info.b());
@@ -858,7 +850,7 @@
     return Status{};
 }
 
-void NEActivationLayerKernel::run(const Window &window, const ThreadInfo &info)
+void NEActivationLayerKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
     // Early exit on disabled activation
     if(!_act_info.enabled())
@@ -871,5 +863,10 @@
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
     ARM_COMPUTE_ERROR_ON(_func == nullptr);
 
-    (this->*_func)(window);
+    ARM_COMPUTE_ERROR_ON(tensors.empty());
+
+    (this->*_func)(tensors.get_const_tensor(TensorType::ACL_SRC),
+                   tensors.get_tensor(TensorType::ACL_DST),
+                   window);
 }
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
index 3532526..5f5a3e5 100644
--- a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,33 +26,20 @@
 #include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
 #include "arm_compute/core/NEON/wrapper/wrapper.h"
-#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 
-#include <algorithm>
-#include <arm_neon.h>
-#include <cstdint>
 #include <map>
 #include <string>
 
-using namespace arm_compute;
-
 namespace arm_compute
 {
-class Coordinates;
-} // namespace arm_compute
-
 namespace
 {
-template <typename T, bool is_sat>
-void add_same(const ITensor *in1, const ITensor *in2, ITensor *out, ConvertPolicy policy, const Window &window)
+template <typename T>
+void add_same(const ITensor *in1, const ITensor *in2, ITensor *out, const ConvertPolicy policy, const Window &window)
 {
-    ARM_COMPUTE_UNUSED(policy);
-
     /** NEON vector tag type. */
     using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
 
@@ -97,7 +84,7 @@
             for(; x <= (window_end_x - window_step_x); x += window_step_x)
             {
                 const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
-                const auto res             = is_sat ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v);
+                const auto res             = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v);
                 wrapper::vstore(output_ptr + x, res);
             }
 
@@ -105,7 +92,7 @@
             for(; x < window_end_x; ++x)
             {
                 const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
-                *(output_ptr + x)          = is_sat ? wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v;
+                *(output_ptr + x)          = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v;
             }
         },
         broadcast_input, non_broadcast_input, output);
@@ -132,7 +119,7 @@
             {
                 const auto val1 = wrapper::vloadq(input1_ptr + x);
                 const auto val2 = wrapper::vloadq(input2_ptr + x);
-                const auto res  = is_sat ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
+                const auto res  = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
                 wrapper::vstore(output_ptr + x, res);
             }
 
@@ -141,7 +128,7 @@
             {
                 const auto val1   = *(input1_ptr + x);
                 const auto val2   = *(input2_ptr + x);
-                *(output_ptr + x) = is_sat ? wrapper::add_sat(val1, val2) : val1 + val2;
+                *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2;
             }
         },
         input1, input2, output);
@@ -169,11 +156,7 @@
     const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
     const UniformQuantizationInfo oq_info  = out->info()->quantization_info().uniform();
 
-    const float32x4_t vscale1    = vdupq_n_f32(iq1_info.scale);
-    const float32x4_t vscale2    = vdupq_n_f32(iq2_info.scale);
     const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
-    const int32x4_t   voffset1   = vdupq_n_s32(iq1_info.offset);
-    const int32x4_t   voffset2   = vdupq_n_s32(iq2_info.offset);
     const float32x4_t voffseto   = vdupq_n_f32(oq_info.offset);
 
     if(is_broadcast_across_x)
@@ -186,6 +169,11 @@
         const UniformQuantizationInfo broadcast_qinfo      = broadcast_tensor->info()->quantization_info().uniform();
         const UniformQuantizationInfo non_broadcast_qinfo  = non_broadcast_tensor->info()->quantization_info().uniform();
 
+        const float32x4_t vscale1  = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale);
+        const float32x4_t vscale2  = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale);
+        const int32x4_t   voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset);
+        const int32x4_t   voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset);
+
         // Clear X Dimension on execution window as we handle manually
         non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
@@ -268,6 +256,11 @@
         Iterator input2(in2, input2_win);
         Iterator output(out, win);
 
+        const float32x4_t vscale1  = vdupq_n_f32(iq1_info.scale);
+        const float32x4_t vscale2  = vdupq_n_f32(iq2_info.scale);
+        const int32x4_t   voffset1 = vdupq_n_s32(iq1_info.offset);
+        const int32x4_t   voffset2 = vdupq_n_s32(iq2_info.offset);
+
         execute_window_loop(win, [&](const Coordinates &)
         {
             const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
@@ -356,11 +349,7 @@
     const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
     const UniformQuantizationInfo oq_info  = out->info()->quantization_info().uniform();
 
-    const float32x4_t vscale1    = vdupq_n_f32(iq1_info.scale);
-    const float32x4_t vscale2    = vdupq_n_f32(iq2_info.scale);
     const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
-    const int32x4_t   voffset1   = vdupq_n_s32(iq1_info.offset);
-    const int32x4_t   voffset2   = vdupq_n_s32(iq2_info.offset);
     const float32x4_t voffseto   = vdupq_n_f32(oq_info.offset);
 
     if(is_broadcast_across_x)
@@ -373,6 +362,11 @@
         const UniformQuantizationInfo broadcast_qinfo      = broadcast_tensor->info()->quantization_info().uniform();
         const UniformQuantizationInfo non_broadcast_qinfo  = non_broadcast_tensor->info()->quantization_info().uniform();
 
+        const float32x4_t vscale1  = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale);
+        const float32x4_t vscale2  = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale);
+        const int32x4_t   voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset);
+        const int32x4_t   voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset);
+
         // Clear X Dimension on execution window as we handle manually
         non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
@@ -455,6 +449,10 @@
         Iterator input2(in2, input2_win);
         Iterator output(out, win);
 
+        const float32x4_t vscale1  = vdupq_n_f32(iq1_info.scale);
+        const float32x4_t vscale2  = vdupq_n_f32(iq2_info.scale);
+        const int32x4_t   voffset1 = vdupq_n_s32(iq1_info.offset);
+        const int32x4_t   voffset2 = vdupq_n_s32(iq2_info.offset);
         execute_window_loop(win, [&](const Coordinates &)
         {
             const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
@@ -828,8 +826,12 @@
     ARM_COMPUTE_UNUSED(policy);
 
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input1);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::S16, DataType::QSYMM16, DataType::F16,
+                                                         DataType::S32, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::S16, DataType::QSYMM16, DataType::F16,
+                                                         DataType::S32, DataType::F32);
 
     const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
 
@@ -847,6 +849,7 @@
             && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
             && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
             && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
+            && !(input1.data_type() == DataType::S32 && input2.data_type() == DataType::S32 && output.data_type() == DataType::S32)
             && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32 && output.data_type() == DataType::F32)
             && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16 && output.data_type() == DataType::F16)
             && !(input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8 && output.data_type() == DataType::QASYMM8)
@@ -861,7 +864,7 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo &input1, const ITensorInfo &input2, ITensorInfo &output)
 {
     const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
     const TensorShape &out_shape    = broadcast_pair.first;
@@ -875,6 +878,10 @@
         {
             set_format_if_unknown(output, Format::S16);
         }
+        if(input1.data_type() == DataType::S32 || input2.data_type() == DataType::S32)
+        {
+            set_format_if_unknown(output, Format::S32);
+        }
         else if(input1.data_type() == DataType::F16 || input2.data_type() == DataType::F16)
         {
             set_format_if_unknown(output, Format::F16);
@@ -908,17 +915,17 @@
 } // namespace
 
 NEArithmeticAdditionKernel::NEArithmeticAdditionKernel()
-    : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _policy()
+    : _func(nullptr), _policy()
 {
 }
 
-void NEArithmeticAdditionKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
+void NEArithmeticAdditionKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), policy));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output, policy));
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(*input1->info(), *input2->info(), *output->info());
+    auto win_config = validate_and_configure_window(*input1, *input2, *output);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
 
     static std::map<std::string, AddFunction *> map_function =
@@ -929,34 +936,33 @@
         { "add_saturate_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED", &add_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED },
         { "add_wrap_QSYMM16_QSYMM16_QSYMM16", &add_QSYMM16_QSYMM16_QSYMM16 },
         { "add_saturate_QSYMM16_QSYMM16_QSYMM16", &add_QSYMM16_QSYMM16_QSYMM16 },
-        { "add_wrap_U8_U8_U8", &add_same<uint8_t, false> },
-        { "add_saturate_U8_U8_U8", &add_same<uint8_t, true> },
+        { "add_wrap_U8_U8_U8", &add_same<uint8_t> },
+        { "add_saturate_U8_U8_U8", &add_same<uint8_t> },
         { "add_wrap_S16_U8_S16", &add_S16_U8_S16 },
         { "add_saturate_S16_U8_S16", &add_S16_U8_S16 },
         { "add_wrap_U8_S16_S16", &add_U8_S16_S16 },
         { "add_saturate_U8_S16_S16", &add_U8_S16_S16 },
         { "add_wrap_U8_U8_S16", &add_U8_U8_S16 },
         { "add_saturate_U8_U8_S16", &add_U8_U8_S16 },
-        { "add_wrap_S16_S16_S16", &add_same<int16_t, false> },
-        { "add_saturate_S16_S16_S16", &add_same<int16_t, true> },
-        { "add_wrap_F32_F32_F32", &add_same<float, false> },
-        { "add_saturate_F32_F32_F32", &add_same<float, false> },
+        { "add_wrap_S16_S16_S16", &add_same<int16_t> },
+        { "add_saturate_S16_S16_S16", &add_same<int16_t> },
+        { "add_wrap_S32_S32_S32", &add_same<int32_t> },
+        { "add_saturate_S32_S32_S32", &add_same<int32_t> },
+        { "add_wrap_F32_F32_F32", &add_same<float> },
+        { "add_saturate_F32_F32_F32", &add_same<float> },
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        { "add_wrap_F16_F16_F16", &add_same<float16_t, false> },
-        { "add_saturate_F16_F16_F16", &add_same<float16_t, false> },
+        { "add_wrap_F16_F16_F16", &add_same<float16_t> },
+        { "add_saturate_F16_F16_F16", &add_same<float16_t> },
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
     };
 
-    _input1 = input1;
-    _input2 = input2;
-    _output = output;
     _policy = policy;
 
     std::string function_to_call("add_");
     function_to_call += policy == ConvertPolicy::WRAP ? "wrap_" : "saturate_";
-    function_to_call += string_from_data_type(input1->info()->data_type()) + "_";
-    function_to_call += string_from_data_type(input2->info()->data_type()) + "_";
-    function_to_call += string_from_data_type(output->info()->data_type());
+    function_to_call += string_from_data_type(input1->data_type()) + "_";
+    function_to_call += string_from_data_type(input2->data_type()) + "_";
+    function_to_call += string_from_data_type(output->data_type());
 
     auto it = map_function.find(function_to_call);
 
@@ -978,12 +984,16 @@
     return Status{};
 }
 
-void NEArithmeticAdditionKernel::run(const Window &window, const ThreadInfo &info)
+void NEArithmeticAdditionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-    (*_func)(_input1, _input2, _output, _policy, window);
+    // Dispatch kernel
+    (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC_0),
+             tensors.get_const_tensor(TensorType::ACL_SRC_1),
+             tensors.get_tensor(TensorType::ACL_DST),
+             _policy,
+             window);
 }
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
index 9b7b235..9237193 100644
--- a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,6 +26,7 @@
 #include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/NEON/NEAsymm.h"
 #include "arm_compute/core/NEON/NESymm.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 
@@ -33,437 +34,633 @@
 {
 namespace
 {
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-void sub_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+template <typename T>
+inline typename std::enable_if<std::is_same<T, int8_t>::value, int8_t>::type
+quantize(float val, const QuantizationInfo &info)
 {
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t ta1 = vld1q_u8(input1.ptr());
-        const uint8x16_t ta2 = vld1q_u8(input2.ptr());
-
-        vst1q_u8(output.ptr(), vsubq_u8(ta1, ta2));
-    },
-    input1, input2, output);
+    return quantize_qasymm8_signed(val, info);
 }
 
-void sub_saturate_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+template <typename T>
+inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8_t>::type
+quantize(float val, const QuantizationInfo &info)
 {
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t ta1 = vld1q_u8(input1.ptr());
-        const uint8x16_t ta2 = vld1q_u8(input2.ptr());
-
-        vst1q_u8(output.ptr(), vqsubq_u8(ta1, ta2));
-    },
-    input1, input2, output);
+    return quantize_qasymm8(val, info);
 }
 
-void sub_saturate_QAYSMM8_QAYSMM8_QAYSMM8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+template <typename T>
+void sub_same(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
 {
+    /** NEON vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+    // Create input windows
+    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+    // Clear X Dimension on execution window as we handle manually
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    constexpr int window_step_x         = 16 / sizeof(T);
+    const auto    window_start_x        = static_cast<int>(window.x().start());
+    const auto    window_end_x          = static_cast<int>(window.x().end());
+    const bool    is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
+
     Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
     Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
     Iterator output(out, window);
 
+    if(is_broadcast_across_x)
+    {
+        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
+        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
+        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
+        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
+        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+
+        // Clear X Dimension on execution window as we handle manually
+        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        Iterator broadcast_input(broadcast_tensor, broadcast_win);
+        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+        Iterator output(out, win);
+
+        execute_window_loop(win, [&](const Coordinates &)
+        {
+            const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
+            const auto output_ptr              = reinterpret_cast<T *>(output.ptr());
+
+            const T    broadcast_value     = *reinterpret_cast<const T *>(broadcast_input.ptr());
+            const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
+
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
+                auto       res             = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v) : wrapper::vsub(broadcast_value_vec, non_broadcast_v);
+                if(is_broadcast_input_2)
+                {
+                    res = wrapper::vmul(res, wrapper::vdup_n(static_cast<T>(-1), ExactTagType{}));
+                }
+                wrapper::vstore(output_ptr + x, res);
+            }
+
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
+                auto       res             = is_sat ? wrapper::sub_sat(broadcast_value, non_broadcast_v) : broadcast_value - non_broadcast_v;
+                if(is_broadcast_input_2)
+                {
+                    res = static_cast<T>(-1) * res;
+                }
+
+                *(output_ptr + x) = res;
+            }
+        },
+        broadcast_input, non_broadcast_input, output);
+    }
+    else
+    {
+        // Clear X Dimension on execution window as we handle manually
+        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        Iterator input1(in1, input1_win);
+        Iterator input2(in2, input2_win);
+        Iterator output(out, win);
+
+        execute_window_loop(win, [&](const Coordinates &)
+        {
+            const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
+            const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
+            const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const auto val1 = wrapper::vloadq(input1_ptr + x);
+                const auto val2 = wrapper::vloadq(input2_ptr + x);
+                const auto res  = is_sat ? wrapper::vqsub(val1, val2) : wrapper::vsub(val1, val2);
+                wrapper::vstore(output_ptr + x, res);
+            }
+
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                const auto val1   = *(input1_ptr + x);
+                const auto val2   = *(input2_ptr + x);
+                *(output_ptr + x) = is_sat ? wrapper::sub_sat(val1, val2) : val1 - val2;
+            }
+        },
+        input1, input2, output);
+    }
+}
+
+template <typename T>
+void sub_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
+{
+    ARM_COMPUTE_UNUSED(is_sat);
+
+    // Create input windows
+    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+    // Clear X Dimension on execution window as we handle manually
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    const int  window_step_x         = 16;
+    const auto window_start_x        = static_cast<int>(window.x().start());
+    const auto window_end_x          = static_cast<int>(window.x().end());
+    const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
+
     const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform();
     const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
     const UniformQuantizationInfo oq_info  = out->info()->quantization_info().uniform();
 
-    execute_window_loop(window, [&](const Coordinates &)
+    const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
+    const float32x4_t voffseto   = vdupq_n_f32(oq_info.offset);
+
+    if(is_broadcast_across_x)
     {
-        const float32x4x4_t ta1 = vdequantize(vld1q_u8(reinterpret_cast<const qasymm8_t *>(input1.ptr())), iq1_info);
-        const float32x4x4_t ta2 = vdequantize(vld1q_u8(reinterpret_cast<const qasymm8_t *>(input2.ptr())), iq2_info);
+        const bool                    is_broadcast_input_2 = input2_win.x().step() == 0;
+        Window                        broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
+        Window                        non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
+        const ITensor                *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
+        const ITensor                *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+        const UniformQuantizationInfo broadcast_qinfo      = broadcast_tensor->info()->quantization_info().uniform();
+        const UniformQuantizationInfo non_broadcast_qinfo  = non_broadcast_tensor->info()->quantization_info().uniform();
+        const float32x4_t             vscale1              = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale);
+        const float32x4_t             vscale2              = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale);
+        const int32x4_t               voffset1             = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset);
+        const int32x4_t               voffset2             = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset);
 
-        const float32x4x4_t ta3 =
+        // Clear X Dimension on execution window as we handle manually
+        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        Iterator broadcast_input(broadcast_tensor, broadcast_win);
+        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+        Iterator output(out, win);
+
+        execute_window_loop(win, [&](const Coordinates &)
         {
+            const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
+            const auto output_ptr              = reinterpret_cast<T *>(output.ptr());
+
+            const auto broadcast_value     = *reinterpret_cast<const T *>(broadcast_input.ptr());
+            const auto broadcast_value_vec = wrapper::vdup_n(static_cast<T>(broadcast_value), wrapper::traits::vector_128_tag{});
+
+            const float32x4x4_t bf =
             {
-                vsubq_f32(ta1.val[0], ta2.val[0]),
-                vsubq_f32(ta1.val[1], ta2.val[1]),
-                vsubq_f32(ta1.val[2], ta2.val[2]),
-                vsubq_f32(ta1.val[3], ta2.val[3]),
+                {
+                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2),
+                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2),
+                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2),
+                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2),
+                }
+            };
+
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const auto a = wrapper::vloadq(non_broadcast_input_ptr + x);
+
+                const float32x4x4_t af =
+                {
+                    {
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
+                    }
+                };
+
+                const int32x4x4_t rf =
+                {
+                    {
+#ifdef __aarch64__
+                        vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
+                        vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
+                        vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
+                        vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
+#else  //__aarch64__
+                        vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
+                        vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
+                        vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
+                        vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
+#endif //__aarch64__
+                    }
+                };
+
+                const auto pa = wrapper::vqmov<T>(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
+                const auto pb = wrapper::vqmov<T>(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
+                wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb));
             }
-        };
 
-        const uint8x16_t result = vquantize(ta3, oq_info);
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                const float afs   = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
+                const float bfs   = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
+                *(output_ptr + x) = quantize<T>(is_broadcast_input_2 ? afs - bfs : bfs - afs, out->info()->quantization_info());
+            }
+        },
+        broadcast_input, non_broadcast_input, output);
+    }
+    else
+    {
+        const float32x4_t vscale1  = vdupq_n_f32(iq1_info.scale);
+        const float32x4_t vscale2  = vdupq_n_f32(iq2_info.scale);
+        const int32x4_t   voffset1 = vdupq_n_s32(iq1_info.offset);
+        const int32x4_t   voffset2 = vdupq_n_s32(iq2_info.offset);
 
-        vst1q_u8(reinterpret_cast<qasymm8_t *>(output.ptr()), result);
-    },
-    input1, input2, output);
+        // Clear X Dimension on execution window as we handle manually
+        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        Iterator input1(in1, input1_win);
+        Iterator input2(in2, input2_win);
+        Iterator output(out, win);
+
+        execute_window_loop(win, [&](const Coordinates &)
+        {
+            const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
+            const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
+            const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const auto a = wrapper::vloadq(input1_ptr + x);
+                const auto b = wrapper::vloadq(input2_ptr + x);
+
+                const float32x4x4_t af =
+                {
+                    {
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
+                    }
+                };
+
+                const float32x4x4_t bf =
+                {
+                    {
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2),
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2),
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2),
+                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2),
+                    }
+                };
+
+                const int32x4x4_t rf =
+                {
+                    {
+#ifdef __aarch64__
+                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
+                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
+                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
+                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
+#else  //__aarch64__
+                        vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
+                        vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
+                        vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
+                        vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
+#endif //__aarch64__
+                    }
+                };
+
+                const auto pa = wrapper::vqmov<T>(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
+                const auto pb = wrapper::vqmov<T>(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
+                wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb));
+            }
+
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale;
+                const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale;
+
+                *(output_ptr + x) = quantize<T>((afs - bfs), out->info()->quantization_info());
+            }
+        },
+        input1, input2, output);
+    }
 }
 
-void sub_saturate_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+void sub_QSYMM16_QSYMM16_QSYMM16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
 {
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
+    ARM_COMPUTE_UNUSED(is_sat);
+
+    // Create input windows
+    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+    // Clear X Dimension on execution window as we handle manually
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    const int  window_step_x         = 8;
+    const auto window_start_x        = static_cast<int>(window.x().start());
+    const auto window_end_x          = static_cast<int>(window.x().end());
+    const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
 
     const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform();
     const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
     const UniformQuantizationInfo oq_info  = out->info()->quantization_info().uniform();
 
-    execute_window_loop(window, [&](const Coordinates &)
+    const float32x4_t vscale1    = vdupq_n_f32(iq1_info.scale);
+    const float32x4_t vscale2    = vdupq_n_f32(iq2_info.scale);
+    const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
+
+    if(is_broadcast_across_x)
     {
-        const float32x4x4_t ta1 = vdequantize(vld1q_s8(reinterpret_cast<const qasymm8_signed_t *>(input1.ptr())), iq1_info);
-        const float32x4x4_t ta2 = vdequantize(vld1q_s8(reinterpret_cast<const qasymm8_signed_t *>(input2.ptr())), iq2_info);
+        const bool                    is_broadcast_input_2 = input2_win.x().step() == 0;
+        Window                        broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
+        Window                        non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
+        const ITensor                *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
+        const ITensor                *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+        const UniformQuantizationInfo broadcast_qinfo      = broadcast_tensor->info()->quantization_info().uniform();
+        const UniformQuantizationInfo non_broadcast_qinfo  = non_broadcast_tensor->info()->quantization_info().uniform();
 
-        const float32x4x4_t ta3 =
+        // Clear X Dimension on execution window as we handle manually
+        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        Iterator broadcast_input(broadcast_tensor, broadcast_win);
+        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+        Iterator output(out, win);
+
+        execute_window_loop(win, [&](const Coordinates &)
         {
+            const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
+            const auto output_ptr              = reinterpret_cast<int16_t *>(output.ptr());
+
+            const int16_t   broadcast_value     = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
+            const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);
+
+            const float32x4x2_t bf =
             {
-                vsubq_f32(ta1.val[0], ta2.val[0]),
-                vsubq_f32(ta1.val[1], ta2.val[1]),
-                vsubq_f32(ta1.val[2], ta2.val[2]),
-                vsubq_f32(ta1.val[3], ta2.val[3]),
+                {
+                    vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2),
+                    vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2),
+                }
+            };
+            const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale;
+
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const int16x8_t     a = vld1q_s16(non_broadcast_input_ptr + x);
+                const float32x4x2_t af =
+                {
+                    {
+                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1),
+                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1),
+                    }
+                };
+
+                const int32x4x4_t rf =
+                {
+                    {
+#ifdef __aarch64__
+                        vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
+                        vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
+#else  //__aarch64__
+                        vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
+                        vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
+#endif //__aarch64__
+                    }
+                };
+
+                const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
+                vst1q_s16(output_ptr + x, pa);
             }
-        };
 
-        const int8x16_t result = vquantize_signed(ta3, oq_info);
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                const float afs   = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale;
+                *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? (bfs - afs) : (afs - bfs), oq_info);
+            }
+        },
+        broadcast_input, non_broadcast_input, output);
+    }
+    else
+    {
+        // Clear X Dimension on execution window as we handle manually
+        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-        vst1q_s8(reinterpret_cast<qasymm8_signed_t *>(output.ptr()), result);
+        Iterator input1(in1, input1_win);
+        Iterator input2(in2, input2_win);
+        Iterator output(out, win);
+
+        execute_window_loop(win, [&](const Coordinates &)
+        {
+            const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
+            const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
+            const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const int16x8_t a = vld1q_s16(input1_ptr + x);
+                const int16x8_t b = vld1q_s16(input2_ptr + x);
+
+                const float32x4x2_t af =
+                {
+                    {
+                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1),
+                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1),
+                    }
+                };
+
+                const float32x4x2_t bf =
+                {
+                    {
+                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2),
+                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2),
+                    }
+                };
+
+                const int32x4x2_t rf =
+                {
+                    {
+#ifdef __aarch64__
+                        vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
+                        vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
+#else  //__aarch64__
+                        vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
+                        vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
+#endif //__aarch64__
+                    }
+                };
+
+                const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
+                vst1q_s16(output_ptr + x, pa);
+            }
+
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                const float afs   = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale;
+                const float bfs   = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale;
+                *(output_ptr + x) = quantize_qsymm16((afs - bfs), out->info()->quantization_info());
+            }
+        },
+        input1, input2, output);
+    }
+}
+
+void sub_S16_U8_S16_impl(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat, bool is_swapped)
+{
+    // Create input windows
+    Window win        = window;
+    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
+
+    // Clear X Dimension on execution window as we handle manually
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input1(in1, input1_win);
+    Iterator input2(in2, input2_win);
+    Iterator output(out, win);
+
+    const int  window_step_x  = 8;
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
+        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+        if(!is_sat)
+        {
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const auto vin1 = wrapper::vloadq(input1_ptr + x);
+                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
+                const auto res  = is_swapped ? wrapper::vsub(vin2, vin1) : wrapper::vsub(vin1, vin2);
+                wrapper::vstore(output_ptr + x, res);
+            }
+
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                const auto res    = is_swapped ? static_cast<int16_t>(*(input2_ptr + x)) - *(input1_ptr + x) : *(input1_ptr + x) - static_cast<int16_t>(*(input2_ptr + x));
+                *(output_ptr + x) = res;
+            }
+        }
+        else
+        {
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const auto vin1 = wrapper::vloadq(input1_ptr + x);
+                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
+                const auto res  = is_swapped ? wrapper::vqsub(vin2, vin1) : wrapper::vqsub(vin1, vin2);
+                wrapper::vstore(output_ptr + x, res);
+            }
+
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                const auto res    = is_swapped ? wrapper::sub_sat(static_cast<int16_t>(*(input2_ptr + x)), *(input1_ptr + x)) : wrapper::sub_sat(*(input1_ptr + x), static_cast<int16_t>(*(input2_ptr + x)));
+                *(output_ptr + x) = res;
+            }
+        }
     },
     input1, input2, output);
 }
 
-void sub_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+void sub_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
 {
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
-
-    const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform();
-    const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
-    const UniformQuantizationInfo oq_info  = out->info()->quantization_info().uniform();
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const int16x8x2_t in1_s16 =
-        {
-            {
-                vld1q_s16(reinterpret_cast<const qsymm16_t *>(input1.ptr())),
-                vld1q_s16(reinterpret_cast<const qsymm16_t *>(input1.ptr()) + 8),
-            }
-        };
-        const int16x8x2_t in2_s16 =
-        {
-            {
-                vld1q_s16(reinterpret_cast<const qsymm16_t *>(input2.ptr())),
-                vld1q_s16(reinterpret_cast<const qsymm16_t *>(input2.ptr()) + 8),
-            }
-        };
-        const float32x4x4_t ta1 = vdequantize(in1_s16, iq1_info);
-        const float32x4x4_t ta2 = vdequantize(in2_s16, iq2_info);
-
-        const float32x4x4_t ta3 =
-        {
-            {
-                vsubq_f32(ta1.val[0], ta2.val[0]),
-                vsubq_f32(ta1.val[1], ta2.val[1]),
-                vsubq_f32(ta1.val[2], ta2.val[2]),
-                vsubq_f32(ta1.val[3], ta2.val[3]),
-            }
-        };
-
-        const int16x8x2_t result = vquantize_qsymm16(ta3, oq_info);
-
-        vst1q_s16(reinterpret_cast<qsymm16_t *>(output.ptr()), result.val[0]);
-        vst1q_s16(reinterpret_cast<qsymm16_t *>(output.ptr()) + 8, result.val[1]);
-    },
-    input1, input2, output);
+    sub_S16_U8_S16_impl(in1, in2, out, window, is_sat, false);
 }
 
-void sub_wrap_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+void sub_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
 {
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const int16x8x2_t ta1 =
-        {
-            {
-                vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr())),
-                vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8),
-            }
-        };
-        const int16x8x2_t ta2 =
-        {
-            {
-                vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr())),
-                vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()) + 8),
-            }
-        };
-
-        const int16x8x2_t ta3 =
-        {
-            {
-                vsubq_s16(ta1.val[0], ta2.val[0]),
-                vsubq_s16(ta1.val[1], ta2.val[1])
-            }
-        };
-
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), ta3.val[0]);
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, ta3.val[1]);
-    },
-    input1, input2, output);
+    // Swap arguments
+    sub_S16_U8_S16_impl(in2, in1, out, window, is_sat, true);
 }
 
-void sub_saturate_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+void sub_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
 {
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
+    // Create input windows
+    Window win        = window;
+    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
 
-    execute_window_loop(window, [&](const Coordinates &)
+    // Clear X Dimension on execution window as we handle manually
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input1(in1, input1_win);
+    Iterator input2(in2, input2_win);
+    Iterator output(out, win);
+
+    const int  window_step_x  = 8;
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    execute_window_loop(win, [&](const Coordinates &)
     {
-        const int16x8x2_t ta1 =
+        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
+        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+        if(!is_sat)
         {
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr())),
-                vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8),
+                const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
+                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
+                wrapper::vstore(output_ptr + x, wrapper::vsub(vin1, vin2));
             }
-        };
-        const int16x8x2_t ta2 =
+
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                *(output_ptr + x) = static_cast<int16_t>(*(input1_ptr + x)) - static_cast<int16_t>(*(input2_ptr + x));
+            }
+        }
+        else
         {
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr())),
-                vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()) + 8),
+                const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
+                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
+                wrapper::vstore(output_ptr + x, wrapper::vqsub(vin1, vin2));
             }
-        };
 
-        const int16x8x2_t ta3 =
-        {
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
             {
-                vqsubq_s16(ta1.val[0], ta2.val[0]),
-                vqsubq_s16(ta1.val[1], ta2.val[1])
+                *(output_ptr + x) = wrapper::sub_sat(static_cast<int16_t>(*(input1_ptr + x)),
+                                                     static_cast<int16_t>(*(input2_ptr + x)));
             }
-        };
-
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), ta3.val[0]);
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, ta3.val[1]);
-    },
-    input1, input2, output);
-}
-
-void sub_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const float16x8x2_t a =
-        {
-            {
-                vld1q_f16(reinterpret_cast<const float16_t *>(input1.ptr())),
-                vld1q_f16(reinterpret_cast<const float16_t *>(input1.ptr()) + 8),
-            }
-        };
-        const float16x8x2_t b =
-        {
-            {
-                vld1q_f16(reinterpret_cast<const float16_t *>(input2.ptr())),
-                vld1q_f16(reinterpret_cast<const float16_t *>(input2.ptr()) + 8),
-            }
-        };
-        const float16x8x2_t res =
-        {
-            {
-                vsubq_f16(a.val[0], b.val[0]),
-                vsubq_f16(a.val[1], b.val[1]),
-            }
-        };
-
-        vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res.val[0]);
-        vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + 8, res.val[1]);
-    },
-    input1, input2, output);
-#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-    ARM_COMPUTE_UNUSED(in1);
-    ARM_COMPUTE_UNUSED(in2);
-    ARM_COMPUTE_UNUSED(out);
-    ARM_COMPUTE_UNUSED(window);
-    ARM_COMPUTE_ERROR("Not supported, recompile the library with arch=arm64-v8.2-a");
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-}
-
-void sub_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const float32x4x4_t ta1 =
-        {
-            {
-                vld1q_f32(reinterpret_cast<const float *>(input1.ptr())),
-                vld1q_f32(reinterpret_cast<const float *>(input1.ptr()) + 4),
-                vld1q_f32(reinterpret_cast<const float *>(input1.ptr()) + 8),
-                vld1q_f32(reinterpret_cast<const float *>(input1.ptr()) + 12),
-            }
-        };
-        const float32x4x4_t ta2 =
-        {
-            {
-                vld1q_f32(reinterpret_cast<const float *>(input2.ptr())),
-                vld1q_f32(reinterpret_cast<const float *>(input2.ptr()) + 4),
-                vld1q_f32(reinterpret_cast<const float *>(input2.ptr()) + 8),
-                vld1q_f32(reinterpret_cast<const float *>(input2.ptr()) + 12),
-            }
-        };
-
-        const float32x4x4_t ta3 =
-        {
-            {
-                vsubq_f32(ta1.val[0], ta2.val[0]),
-                vsubq_f32(ta1.val[1], ta2.val[1]),
-                vsubq_f32(ta1.val[2], ta2.val[2]),
-                vsubq_f32(ta1.val[3], ta2.val[3]),
-            }
-        };
-
-        vst1q_f32(reinterpret_cast<float *>(output.ptr()), ta3.val[0]);
-        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, ta3.val[1]);
-        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, ta3.val[2]);
-        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, ta3.val[3]);
-    },
-    input1, input2, output);
-}
-void sub_wrap_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
-        int16x8_t        a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
-        int16x8_t        a2_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8);
-
-        a1_0 = vsubq_s16(a1_0, vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))));
-        a2_0 = vsubq_s16(a2_0, vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))));
-
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
-    },
-    input1, input2, output);
-}
-
-void sub_saturate_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
-        int16x8_t        a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
-        int16x8_t        a2_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input1.ptr()) + 8);
-
-        a1_0 = vqsubq_s16(a1_0, vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))));
-        a2_0 = vqsubq_s16(a2_0, vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))));
-
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
-    },
-    input1, input2, output);
-}
-
-void sub_wrap_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t bv_0 = vld1q_u8(input1.ptr());
-        int16x8_t        a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
-        int16x8_t        a2_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()) + 8);
-
-        a1_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))), a1_0);
-        a2_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))), a2_0);
-
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
-    },
-    input1, input2, output);
-}
-
-void sub_saturate_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t bv_0 = vld1q_u8(input1.ptr());
-        int16x8_t        a1_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
-        int16x8_t        a2_0 = vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()) + 8);
-
-        a1_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))), a1_0);
-        a2_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))), a2_0);
-
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
-    },
-    input1, input2, output);
-}
-
-void sub_wrap_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t av_0 = vld1q_u8(input1.ptr());
-        const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
-
-        const int16x8_t a1_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(av_0))),
-                                         vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))));
-        const int16x8_t a2_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(av_0))),
-                                         vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))));
-
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
-    },
-    input1, input2, output);
-}
-
-void sub_saturate_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
-    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t av_0 = vld1q_u8(input1.ptr());
-        const uint8x16_t bv_0 = vld1q_u8(input2.ptr());
-
-        const int16x8_t a1_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(av_0))),
-                                          vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))));
-        const int16x8_t a2_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(av_0))),
-                                          vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))));
-
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), a1_0);
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, a2_0);
+        }
     },
     input1, input2, output);
 }
@@ -519,141 +716,107 @@
     }
     return Status{};
 }
+} // namespace
 
-inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
+NEArithmeticSubtractionKernel::NEArithmeticSubtractionKernel()
+    : _func(nullptr), _policy(ConvertPolicy::WRAP)
 {
-    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(input1, input2);
+}
+
+void NEArithmeticSubtractionKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output, policy));
+
+    _policy = policy;
+
+    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
     const TensorShape &out_shape    = broadcast_pair.first;
     const ValidRegion &valid_region = broadcast_pair.second;
 
     // Auto initialize output if not initialized
-    {
-        set_shape_if_empty(output, out_shape);
+    set_shape_if_empty(*output, out_shape);
 
-        if(input1.data_type() == DataType::S16 || input2.data_type() == DataType::S16)
-        {
-            set_format_if_unknown(output, Format::S16);
-        }
-        else if(input1.data_type() == DataType::F16 || input2.data_type() == DataType::F16)
-        {
-            set_format_if_unknown(output, Format::F16);
-        }
-        else if(input1.data_type() == DataType::F32 || input2.data_type() == DataType::F32)
-        {
-            set_format_if_unknown(output, Format::F32);
-        }
-        else if(input1.data_type() == DataType::QASYMM8 || input2.data_type() == DataType::QASYMM8)
-        {
-            set_data_type_if_unknown(output, DataType::QASYMM8);
-        }
-        else if(input1.data_type() == DataType::QASYMM8_SIGNED || input2.data_type() == DataType::QASYMM8_SIGNED)
-        {
-            set_data_type_if_unknown(output, DataType::QASYMM8_SIGNED);
-        }
-        else if(input1.data_type() == DataType::QSYMM16 || input2.data_type() == DataType::QSYMM16)
-        {
-            set_data_type_if_unknown(output, DataType::QSYMM16);
-        }
+    switch(input1->data_type())
+    {
+        case DataType::U8:
+            if(input2->data_type() == DataType::U8 && output->data_type() == DataType::U8)
+            {
+                _func = &sub_same<uint8_t>;
+            }
+            else if(input2->data_type() == DataType::U8 && output->data_type() == DataType::S16)
+            {
+                _func = &sub_U8_U8_S16;
+            }
+            else
+            {
+                _func = &sub_U8_S16_S16;
+            }
+            break;
+        case DataType::QASYMM8:
+            _func = &sub_quantized<uint8_t>;
+            set_data_type_if_unknown(*output, DataType::QASYMM8);
+            break;
+        case DataType::QASYMM8_SIGNED:
+            _func = &sub_quantized<int8_t>;
+            set_data_type_if_unknown(*output, DataType::QASYMM8_SIGNED);
+            break;
+        case DataType::S16:
+            if(input2->data_type() == DataType::U8)
+            {
+                _func = &sub_S16_U8_S16;
+            }
+            else
+            {
+                _func = &sub_same<int16_t>;
+            }
+            set_format_if_unknown(*output, Format::S16);
+            break;
+        case DataType::QSYMM16:
+            _func = &sub_QSYMM16_QSYMM16_QSYMM16;
+            set_data_type_if_unknown(*output, DataType::QSYMM16);
+            break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            _func = &sub_same<float16_t>;
+            set_format_if_unknown(*output, Format::F16);
+            break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        case DataType::F32:
+            _func = &sub_same<float>;
+            set_format_if_unknown(*output, Format::F32);
+            break;
+        default:
+            _func = nullptr;
     }
 
-    Window win        = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
-    Window win_input1 = win.broadcast_if_dimension_le_one(input1);
-    Window win_input2 = win.broadcast_if_dimension_le_one(input2);
+    // NEArithmeticSubtractionKernel doesn't need padding so update_window_and_padding() can be skipped
+    Coordinates coord;
+    coord.set_num_dimensions(output->num_dimensions());
+    output->set_valid_region(valid_region);
+    Window win = calculate_max_window(valid_region, Steps());
 
-    AccessWindowHorizontal input1_access(&input1, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
-
-    bool window_changed = update_window_and_padding(win_input1, input1_access)
-                          || update_window_and_padding(win_input2, input2_access)
-                          || update_window_and_padding(win, output_access);
-
-    output_access.set_valid_region(win, valid_region);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-} // namespace
-
-NEArithmeticSubtractionKernel::NEArithmeticSubtractionKernel()
-    : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void NEArithmeticSubtractionKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), policy));
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(*input1->info(), *input2->info(), *output->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-
-    static std::map<std::string, NEArithmeticSubtractionKernel::SubFunction *> map_function =
-    {
-        { "sub_wrap_U8_U8_U8", &sub_wrap_U8_U8_U8 },
-        { "sub_wrap_U8_U8_S16", &sub_wrap_U8_U8_S16 },
-        { "sub_saturate_U8_U8_U8", &sub_saturate_U8_U8_U8 },
-        { "sub_saturate_U8_U8_S16", &sub_saturate_U8_U8_S16 },
-        { "sub_saturate_QASYMM8_QASYMM8_QASYMM8", &sub_saturate_QAYSMM8_QAYSMM8_QAYSMM8 },
-        { "sub_saturate_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED", &sub_saturate_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED },
-        { "sub_saturate_QSYMM16_QSYMM16_QSYMM16", &sub_saturate_QSYMM16_QSYMM16_QSYMM16 },
-        { "sub_wrap_U8_S16_S16", &sub_wrap_U8_S16_S16 },
-        { "sub_wrap_S16_U8_S16", &sub_wrap_S16_U8_S16 },
-        { "sub_saturate_U8_S16_S16", &sub_saturate_U8_S16_S16 },
-        { "sub_saturate_S16_U8_S16", &sub_saturate_S16_U8_S16 },
-        { "sub_wrap_S16_S16_S16", &sub_wrap_S16_S16_S16 },
-        { "sub_saturate_S16_S16_S16", &sub_saturate_S16_S16_S16 },
-        { "sub_wrap_F32_F32_F32", &sub_F32_F32_F32 },
-        { "sub_saturate_F32_F32_F32", &sub_F32_F32_F32 },
-        { "sub_wrap_F16_F16_F16", &sub_F16_F16_F16 },
-        { "sub_saturate_F16_F16_F16", &sub_F16_F16_F16 },
-    };
-
-    _input1 = input1;
-    _input2 = input2;
-    _output = output;
-
-    std::string function_to_call("sub_");
-    function_to_call += policy == ConvertPolicy::WRAP ? "wrap_" : "saturate_";
-    function_to_call += string_from_data_type(input1->info()->data_type()) + "_";
-    function_to_call += string_from_data_type(input2->info()->data_type()) + "_";
-    function_to_call += string_from_data_type(output->info()->data_type());
-
-    auto it = map_function.find(function_to_call);
-
-    if(it != map_function.end())
-    {
-        _func = it->second;
-    }
-
-    INEKernel::configure(win_config.second);
+    INEKernel::configure(win);
 }
 
 Status NEArithmeticSubtractionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
-
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, policy));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
 
     return Status{};
 }
 
-void NEArithmeticSubtractionKernel::run(const Window &window, const ThreadInfo &info)
+void NEArithmeticSubtractionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-    (*_func)(_input1, _input2, _output, window);
+    // Dispatch kernel
+    (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC_0),
+             tensors.get_const_tensor(TensorType::ACL_SRC_1),
+             tensors.get_tensor(TensorType::ACL_DST),
+             window,
+             (_policy == ConvertPolicy::SATURATE));
 }
-
-BorderSize NEArithmeticSubtractionKernel::border_size() const
-{
-    const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
-    const unsigned int border        = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
-    return BorderSize{ 0, border, 0, 0 };
-}
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp
index 5650b81..0ee6d0e 100644
--- a/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchConcatenateLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,17 +26,13 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
 #include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 
-#include <cstdint>
-
 namespace arm_compute
 {
 namespace
@@ -145,21 +141,19 @@
 } // namespace
 
 NEBatchConcatenateLayerKernel::NEBatchConcatenateLayerKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _batch_offset(0)
+    : _func(nullptr), _batch_offset(0)
 {
 }
 
-void NEBatchConcatenateLayerKernel::configure(const ITensor *input, unsigned int batch_offset, ITensor *output)
+void NEBatchConcatenateLayerKernel::configure(const ITensorInfo *input, unsigned int batch_offset, ITensorInfo *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), batch_offset, output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, batch_offset, output));
 
     _func         = nullptr;
-    _input        = input;
-    _output       = output;
     _batch_offset = batch_offset;
 
-    switch(input->info()->data_type())
+    switch(input->data_type())
     {
         case DataType::S8:
         case DataType::U8:
@@ -182,10 +176,10 @@
     }
 
     // Configure kernel window
-    Window      win = calculate_max_window(*output->info(), Steps());
+    Window      win = calculate_max_window(*output, Steps());
     Coordinates coord;
-    coord.set_num_dimensions(output->info()->num_dimensions());
-    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+    coord.set_num_dimensions(output->num_dimensions());
+    output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
     INEKernel::configure(win);
 }
 
@@ -197,13 +191,16 @@
     return Status{};
 }
 
-void NEBatchConcatenateLayerKernel::run(const Window &window, const ThreadInfo &info)
+void NEBatchConcatenateLayerKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
     ARM_COMPUTE_ERROR_ON(_func == nullptr);
 
-    (*_func)(_input, _output, _batch_offset, window);
+    (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC),
+             tensors.get_tensor(TensorType::ACL_DST),
+             _batch_offset,
+             window);
 }
 } // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index 6bd30ee..0651cf2 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,10 +33,12 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+
 #include <map>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 Status
@@ -82,56 +84,41 @@
 
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *mean, ITensorInfo *var, ITensorInfo *gamma, ITensorInfo *beta)
 {
+    ARM_COMPUTE_UNUSED(mean, var, gamma, beta);
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input, Steps());
+
     if(output != nullptr)
     {
-        // Output tensor auto initialization if not yet initialized
+        // Output auto initialization if not yet initialized
         auto_init_if_empty(*output, *input->clone());
+
+        // NEBatchNormalizationLayerKernel doesn't need padding so update_window_and_padding() can be skipped
+        Coordinates coord;
+        coord.set_num_dimensions(output->num_dimensions());
+        output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
     }
 
-    unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
-
-    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-    bool                   window_changed = update_window_and_padding(win, input_access);
-
-    if(output != nullptr)
-    {
-        AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-        window_changed |= update_window_and_padding(win, output_access);
-        output_access.set_valid_region(win, input->valid_region());
-    }
-
-    // Mean, var, gamma and beta get parallelized for the NHWC case as they follow the channel dimension, which is along the first axis
-    if(input->data_layout() == DataLayout::NHWC)
-    {
-        AccessWindowHorizontal mean_access(mean, 0, num_elems_processed_per_iteration);
-        AccessWindowHorizontal var_access(var, 0, num_elems_processed_per_iteration);
-        window_changed |= update_window_and_padding(win, mean_access, var_access);
-
-        if(gamma != nullptr)
-        {
-            AccessWindowHorizontal gamma_access(gamma, 0, num_elems_processed_per_iteration);
-            window_changed |= update_window_and_padding(win, gamma_access);
-        }
-        if(beta != nullptr)
-        {
-            AccessWindowHorizontal beta_access(beta, 0, num_elems_processed_per_iteration);
-            window_changed |= update_window_and_padding(win, beta_access);
-        }
-    }
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
+    return std::make_pair(Status{}, win);
 }
 } //namespace
 
-template <bool fused_activation, typename F>
-void NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw(const Window &window)
+template <typename T, bool fused_activation, typename F>
+void NEBatchNormalizationLayerKernel::batch_normalization_nchw(const Window &window)
 {
-    ARM_COMPUTE_UNUSED(window);
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    Iterator input(_input, window);
-    Iterator output(_output, window);
+    /** NEON vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+    const int  window_step_x  = 16 / sizeof(T);
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    Window win_to_use = window;
+    win_to_use.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input(_input, win_to_use);
+    Iterator output(_output, win_to_use);
 
     F activation_functor(_act_info);
 
@@ -139,196 +126,168 @@
     // Only compute denominator and NEON vectors once per feature map.
     int slice = -1;
 
-    const auto input_mean  = reinterpret_cast<const float16_t *>(_mean->ptr_to_element(Coordinates(0, 0)));
-    const auto input_var   = reinterpret_cast<const float16_t *>(_var->ptr_to_element(Coordinates(0, 0)));
-    const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const float16_t *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
-    const auto input_beta  = (_beta != nullptr) ? reinterpret_cast<const float16_t *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+    const auto input_mean  = reinterpret_cast<const T *>(_mean->ptr_to_element(Coordinates(0, 0)));
+    const auto input_var   = reinterpret_cast<const T *>(_var->ptr_to_element(Coordinates(0, 0)));
+    const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const T *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+    const auto input_beta  = (_beta != nullptr) ? reinterpret_cast<const T *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
 
-    float16x8_t       mean_vec    = vdupq_n_f16(0.0);
-    float16x8_t       var_vec     = vdupq_n_f16(0.0);
-    float16x8_t       gamma_vec   = vdupq_n_f16(1.0);
-    float16x8_t       beta_vec    = vdupq_n_f16(0.0);
-    float16x8_t       denominator = vdupq_n_f16(0.0);
-    const float16x8_t epsilon_vec = vdupq_n_f16(_epsilon);
-    execute_window_loop(window, [&](const Coordinates & id)
+    T mean        = static_cast<T>(0);
+    T var         = static_cast<T>(0);
+    T gamma       = static_cast<T>(1);
+    T beta        = static_cast<T>(0);
+    T denominator = static_cast<T>(0);
+
+    auto       mean_vec        = wrapper::vdup_n(mean, ExactTagType{});
+    auto       var_vec         = wrapper::vdup_n(var, ExactTagType{});
+    auto       gamma_vec       = wrapper::vdup_n(gamma, ExactTagType{});
+    auto       beta_vec        = wrapper::vdup_n(beta, ExactTagType{});
+    auto       denominator_vec = wrapper::vdup_n(denominator, ExactTagType{});
+    const auto epsilon_vec     = wrapper::vdup_n(static_cast<T>(_epsilon), ExactTagType{});
+    execute_window_loop(win_to_use, [&](const Coordinates & id)
     {
+        const auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
+        const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+
         if(slice != id.z())
         {
-            // Conctruct vectors
-            mean_vec = vdupq_n_f16(*(input_mean + id.z()));
-            var_vec  = vdupq_n_f16(*(input_var + id.z()));
+            mean     = input_mean[id.z()];
+            var      = input_var[id.z()];
+            mean_vec = wrapper::vdup_n(mean, ExactTagType{});
+            var_vec  = wrapper::vdup_n(var, ExactTagType{});
             if(input_gamma != nullptr)
             {
-                gamma_vec = vdupq_n_f16(*(input_gamma + id.z()));
+                gamma     = input_gamma[id.z()];
+                gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
             }
             if(input_beta != nullptr)
             {
-                beta_vec = vdupq_n_f16(*(input_beta + id.z()));
+                beta     = input_beta[id.z()];
+                beta_vec = wrapper::vdup_n(beta, ExactTagType{});
             }
 
             // Calculate denominator
-            denominator = vinvsqrtq_f16(vaddq_f16(var_vec, epsilon_vec));
-            slice       = id.z();
+            denominator_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+            denominator     = wrapper::vgetlane(denominator_vec, 0);
+            slice           = id.z();
         }
 
-        // Calculate x bar and store results
-        const float16x8_t numerator = vsubq_f16(vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())), mean_vec);
-        const float16x8_t x_bar     = vmulq_f16(numerator, denominator);
-        float16x8_t       res       = vaddq_f16(beta_vec, vmulq_f16(x_bar, gamma_vec));
-
-        // Perform fused activation
-        if(fused_activation)
+        // Perform core calculations using vector operations
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
         {
-            activation_functor(res);
+            // Calculate x bar
+            const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
+            const auto x_bar     = wrapper::vmul(numerator, denominator_vec);
+            auto       res       = wrapper::vmla(beta_vec, x_bar, gamma_vec);
+
+            // Perform fused activation
+            if(fused_activation)
+            {
+                activation_functor(res);
+            }
+
+            // Store results
+            wrapper::vstore(output_ptr + x, res);
         }
 
-        vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
+        {
+            const T numerator = input_ptr[x] - mean;
+            const T x_bar     = numerator * denominator;
+            T       res       = beta + x_bar * gamma;
+
+            // Perform fused activation
+            if(fused_activation)
+            {
+                activation_functor(res);
+            }
+
+            // Store results
+            *(output_ptr + x) = res;
+        }
     },
     input, output);
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 }
 
-template <bool fused_activation, typename F>
-void NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc(const Window &window)
+template <typename T, bool fused_activation, typename F>
+void NEBatchNormalizationLayerKernel::batch_normalization_nhwc(const Window &window)
 {
-    ARM_COMPUTE_UNUSED(window);
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    Iterator input(_input, window);
-    Iterator output(_output, window);
+    /** NEON vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+    const int  window_step_x  = 16 / sizeof(T);
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input(_input, win_collapsed);
+    Iterator output(_output, win_collapsed);
 
     F activation_functor(_act_info);
 
-    const auto input_mean  = reinterpret_cast<const float16_t *>(_mean->ptr_to_element(Coordinates(0, 0)));
-    const auto input_var   = reinterpret_cast<const float16_t *>(_var->ptr_to_element(Coordinates(0, 0)));
-    const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const float16_t *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
-    const auto input_beta  = (_beta != nullptr) ? reinterpret_cast<const float16_t *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+    const auto input_mean  = reinterpret_cast<const T *>(_mean->ptr_to_element(Coordinates(0, 0)));
+    const auto input_var   = reinterpret_cast<const T *>(_var->ptr_to_element(Coordinates(0, 0)));
+    const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const T *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+    const auto input_beta  = (_beta != nullptr) ? reinterpret_cast<const T *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
 
-    const float16x8_t epsilon_vec = vdupq_n_f16(_epsilon);
-    execute_window_loop(window, [&](const Coordinates & id)
+    const auto epsilon_vec = wrapper::vdup_n(static_cast<T>(_epsilon), ExactTagType{});
+    execute_window_loop(win_collapsed, [&](const Coordinates &)
     {
-        // Conctruct vectors
-        const float16x8_t mean_vec  = vld1q_f16(input_mean + id.x());
-        const float16x8_t var_vec   = vld1q_f16(input_var + id.x());
-        const float16x8_t gamma_vec = (input_gamma != nullptr) ? vld1q_f16(input_gamma + id.x()) : vdupq_n_f16(1.0);
-        const float16x8_t beta_vec  = (input_beta != nullptr) ? vld1q_f16(input_beta + id.x()) : vdupq_n_f16(0.0);
-        // Calculate denominator
-        const float16x8_t denominator = vinvsqrtq_f16(vaddq_f16(var_vec, epsilon_vec));
+        const auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
+        const auto output_ptr = reinterpret_cast<T *>(output.ptr());
 
-        // Calculate x bar and store results
-        const float16x8_t numerator = vsubq_f16(vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())), mean_vec);
-        const float16x8_t x_bar     = vmulq_f16(numerator, denominator);
-        float16x8_t       res       = vaddq_f16(beta_vec, vmulq_f16(x_bar, gamma_vec));
-
-        // Perform fused activation
-        if(fused_activation)
-        {
-            activation_functor(res);
-        }
-
-        vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), res);
-    },
-    input, output);
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-}
-
-template <bool fused_activation, typename F>
-void NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw(const Window &window)
-{
-    Iterator input(_input, window);
-    Iterator output(_output, window);
-
-    F activation_functor(_act_info);
-
-    // Hold information about the current feature map we are iterating.
-    // Only compute denominator and NEON vectors once per feature map.
-    int slice = -1;
-
-    const auto input_mean  = reinterpret_cast<const float *>(_mean->ptr_to_element(Coordinates(0, 0)));
-    const auto input_var   = reinterpret_cast<const float *>(_var->ptr_to_element(Coordinates(0, 0)));
-    const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const float *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
-    const auto input_beta  = (_beta != nullptr) ? reinterpret_cast<const float *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
-
-    float32x4_t       mean_vec    = vdupq_n_f32(0.0);
-    float32x4_t       var_vec     = vdupq_n_f32(0.0);
-    float32x4_t       gamma_vec   = vdupq_n_f32(1.0);
-    float32x4_t       beta_vec    = vdupq_n_f32(0.0);
-    float32x4_t       denominator = vdupq_n_f32(0.0);
-    const float32x4_t epsilon_vec = vdupq_n_f32(_epsilon);
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        if(slice != id.z())
+        // Perform core calculations using vector operations
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
         {
             // Conctruct vectors
-            mean_vec = vdupq_n_f32(*(input_mean + id.z()));
-            var_vec  = vdupq_n_f32(*(input_var + id.z()));
-            if(input_gamma != nullptr)
-            {
-                gamma_vec = vdupq_n_f32(*(input_gamma + id.z()));
-            }
-            if(input_beta != nullptr)
-            {
-                beta_vec = vdupq_n_f32(*(input_beta + id.z()));
-            }
+            const auto mean_vec  = wrapper::vloadq(input_mean + x);
+            const auto var_vec   = wrapper::vloadq(input_var + x);
+            const auto gamma_vec = (input_gamma != nullptr) ? wrapper::vloadq(input_gamma + x) : wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{});
+            const auto beta_vec  = (input_beta != nullptr) ? wrapper::vloadq(input_beta + x) : wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
 
             // Calculate denominator
-            denominator = vinvsqrtq_f32(vaddq_f32(var_vec, epsilon_vec));
-            slice       = id.z();
+            const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+
+            // Calculate x bar
+            const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
+            const auto x_bar     = wrapper::vmul(numerator, denominator);
+            auto       res       = wrapper::vmla(beta_vec, x_bar, gamma_vec);
+
+            // Perform fused activation
+            if(fused_activation)
+            {
+                activation_functor(res);
+            }
+
+            // Store results
+            wrapper::vstore(output_ptr + x, res);
         }
 
-        // Calculate x bar
-        const float32x4_t numerator = vsubq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), mean_vec);
-        const float32x4_t x_bar     = vmulq_f32(numerator, denominator);
-        float32x4_t       res       = vmlaq_f32(beta_vec, x_bar, gamma_vec);
-
-        // Perform fused activation
-        if(fused_activation)
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
         {
-            activation_functor(res);
+            // Construct vectors
+            const T gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f;
+            const T beta  = (input_beta != nullptr) ? input_beta[x] : 0.f;
+
+            const T denominator = sqrt(input_var[x] + _epsilon);
+            const T numerator   = input_ptr[x] - input_mean[x];
+            const T x_bar       = numerator / denominator;
+            T       res         = beta + x_bar * gamma;
+
+            // Perform fused activation
+            if(fused_activation)
+            {
+                activation_functor(res);
+            }
+
+            // Store results
+            *reinterpret_cast<T *>(output_ptr + x) = res;
         }
-
-        // Store results
-        vst1q_f32(reinterpret_cast<float *>(output.ptr()), res);
-    },
-    input, output);
-}
-
-template <bool fused_activation, typename F>
-void NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc(const Window &window)
-{
-    Iterator input(_input, window);
-    Iterator output(_output, window);
-
-    F activation_functor(_act_info);
-
-    const auto input_mean  = reinterpret_cast<const float *>(_mean->ptr_to_element(Coordinates(0, 0)));
-    const auto input_var   = reinterpret_cast<const float *>(_var->ptr_to_element(Coordinates(0, 0)));
-    const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const float *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
-    const auto input_beta  = (_beta != nullptr) ? reinterpret_cast<const float *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
-
-    const float32x4_t epsilon_vec = vdupq_n_f32(_epsilon);
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        // Conctruct vectors
-        const float32x4_t mean_vec  = vld1q_f32(input_mean + id.x());
-        const float32x4_t var_vec   = vld1q_f32(input_var + id.x());
-        const float32x4_t gamma_vec = (input_gamma != nullptr) ? vld1q_f32(input_gamma + id.x()) : vdupq_n_f32(1.0);
-        const float32x4_t beta_vec  = (input_beta != nullptr) ? vld1q_f32(input_beta + id.x()) : vdupq_n_f32(0.0);
-        // Calculate denominator
-        const float32x4_t denominator = vinvsqrtq_f32(vaddq_f32(var_vec, epsilon_vec));
-
-        // Calculate x bar
-        const float32x4_t numerator = vsubq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), mean_vec);
-        const float32x4_t x_bar     = vmulq_f32(numerator, denominator);
-        float32x4_t       res       = vmlaq_f32(beta_vec, x_bar, gamma_vec);
-
-        // Perform fused activation
-        if(fused_activation)
-        {
-            activation_functor(res);
-        }
-
-        // Store results
-        vst1q_f32(reinterpret_cast<float *>(output.ptr()), res);
     },
     input, output);
 }
@@ -340,13 +299,13 @@
     {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
-            _func = (is_nhwc) ? &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc<false, ::detail::dummy<float16_t, 8>> :
-                    &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw<false, ::detail::dummy<float16_t, 8>>;
+            _func = (is_nhwc) ? &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float16_t, false, detail::dummy<float16_t, 8>> :
+                    &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, false, detail::dummy<float16_t, 8>>;
             break;
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F32:
-            _func = (is_nhwc) ? &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<false, ::detail::dummy<float, 4>> :
-                    &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw<false, ::detail::dummy<float, 4>>;
+            _func = (is_nhwc) ? &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float, false, detail::dummy<float, 4>> :
+                    &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, false, detail::dummy<float, 4>>;
             break;
         default:
             ARM_COMPUTE_ERROR("Element size not supported");
@@ -359,31 +318,31 @@
     // NCHW Fused Batched Normalization with activation functions : FP32
     static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32_nchw =
     {
-        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw<true, ::detail::relu<float, 4>> },
-        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw<true, ::detail::brelu<float, 4>> },
-        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw<true, ::detail::lubrelu<float, 4>> }
+        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::relu<float, 4>> },
+        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::brelu<float, 4>> },
+        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::lubrelu<float, 4>> }
     };
     // NHWC Fused Batched Normalization with activation functions : FP32
     static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32_nhwc =
     {
-        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<true, ::detail::relu<float, 4>> },
-        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<true, ::detail::brelu<float, 4>> },
-        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<true, ::detail::lubrelu<float, 4>> }
+        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float, true, detail::relu<float, 4>> },
+        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float, true, detail::brelu<float, 4>> },
+        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float, true, detail::lubrelu<float, 4>> }
     };
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     // NCHW Fused Batched Normalization with activation functions : FP16
     static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f16_nchw =
     {
-        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw<true, ::detail::relu<float16_t, 8>> },
-        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw<true, ::detail::brelu<float16_t, 8>> },
-        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw<true, ::detail::lubrelu<float16_t, 8>> }
+        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::relu<float16_t, 8>> },
+        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::brelu<float16_t, 8>> },
+        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::lubrelu<float16_t, 8>> }
     };
     // NHWC Fused Batched Normalization with activation functions : FP16
     static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f16_nhwc =
     {
-        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc<true, ::detail::relu<float16_t, 8>> },
-        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc<true, ::detail::brelu<float16_t, 8>> },
-        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc<true, ::detail::lubrelu<float16_t, 8>> }
+        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float16_t, true, detail::relu<float16_t, 8>> },
+        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float16_t, true, detail::brelu<float16_t, 8>> },
+        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float16_t, true, detail::lubrelu<float16_t, 8>> }
     };
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
@@ -475,3 +434,4 @@
 
     (this->*_func)(window);
 }
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp
index c4c0f01..eb28ce0 100644
--- a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,12 +25,9 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include <arm_neon.h>
-#include <cstdint>
 
 using namespace arm_compute::misc::shape_calculator;
 
diff --git a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
index 71312a9..fa8332e 100644
--- a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
index 5791dcc..4da07f9 100644
--- a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
index 8aed9bb..591acf5 100644
--- a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
index e2dcb95..b0aec40 100644
--- a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
index ea8d47d..56444dc 100644
--- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
+++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEBox3x3Kernel.cpp b/src/core/NEON/kernels/NEBox3x3Kernel.cpp
index 7ca5e3c..d5d03a9 100644
--- a/src/core/NEON/kernels/NEBox3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEBox3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NECannyEdgeKernel.cpp b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
index 094b778..0278bb0 100644
--- a/src/core/NEON/kernels/NECannyEdgeKernel.cpp
+++ b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEChannelCombineKernel.cpp b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
index 539154d..0de6c43 100644
--- a/src/core/NEON/kernels/NEChannelCombineKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEChannelExtractKernel.cpp b/src/core/NEON/kernels/NEChannelExtractKernel.cpp
index 61e1304..800c636 100644
--- a/src/core/NEON/kernels/NEChannelExtractKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelExtractKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
index 45f6fd9..88cd0ae 100644
--- a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NECol2ImKernel.cpp b/src/core/NEON/kernels/NECol2ImKernel.cpp
index 4f1b1d8..6a07def 100644
--- a/src/core/NEON/kernels/NECol2ImKernel.cpp
+++ b/src/core/NEON/kernels/NECol2ImKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEColorConvertKernel.cpp b/src/core/NEON/kernels/NEColorConvertKernel.cpp
index 7a66b6c..1f07965 100644
--- a/src/core/NEON/kernels/NEColorConvertKernel.cpp
+++ b/src/core/NEON/kernels/NEColorConvertKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp b/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp
index ab66360..97bb8cc 100644
--- a/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp
+++ b/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp
index 39e030e..d439f43 100644
--- a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp
+++ b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEConvolutionKernel.cpp b/src/core/NEON/kernels/NEConvolutionKernel.cpp
index b154340..7103fa1 100644
--- a/src/core/NEON/kernels/NEConvolutionKernel.cpp
+++ b/src/core/NEON/kernels/NEConvolutionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NECopyKernel.cpp b/src/core/NEON/kernels/NECopyKernel.cpp
index 6bf4954..3d00139 100644
--- a/src/core/NEON/kernels/NECopyKernel.cpp
+++ b/src/core/NEON/kernels/NECopyKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NECropKernel.cpp b/src/core/NEON/kernels/NECropKernel.cpp
index 4257611..03bc9f0 100644
--- a/src/core/NEON/kernels/NECropKernel.cpp
+++ b/src/core/NEON/kernels/NECropKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -76,6 +76,12 @@
     return vcvtq_f32_u32(vmovl_u16(wrapper::vload(ptr)));
 }
 
+template <>
+inline float32x4_t load_as_f32(uint8_t *ptr)
+{
+    return vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(wrapper::vload(ptr)))));
+}
+
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 template <>
 inline float32x4_t load_as_f32(float16_t *ptr)
@@ -267,6 +273,9 @@
         case DataType::S16:
             _in_bounds_crop_function = &in_bounds_crop_window<int16_t>;
             break;
+        case DataType::U8:
+            _in_bounds_crop_function = &in_bounds_crop_window<uint8_t>;
+            break;
         default:
             ARM_COMPUTE_ERROR("Datatype not supported");
     }
@@ -276,7 +285,7 @@
 {
     ARM_COMPUTE_UNUSED(extrapolation_value);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::F16, DataType::U32, DataType::S32, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::U16, DataType::S16, DataType::F16, DataType::U32, DataType::S32, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
     ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().num_dimensions() > 4);
     ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[0] != 4);
diff --git a/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
index 31b688c..cec0e1c 100644
--- a/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
+++ b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
index 3ac043a..6926ec1 100644
--- a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -142,21 +142,19 @@
 } // namespace
 
 NEDepthConcatenateLayerKernel::NEDepthConcatenateLayerKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _depth_offset(0)
+    : _func(nullptr), _depth_offset(0)
 {
 }
 
-void NEDepthConcatenateLayerKernel::configure(const ITensor *input, unsigned int depth_offset, ITensor *output)
+void NEDepthConcatenateLayerKernel::configure(const ITensorInfo *input, unsigned int depth_offset, ITensorInfo *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), depth_offset, output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, depth_offset, output));
 
     _func         = nullptr;
-    _input        = input;
-    _output       = output;
     _depth_offset = depth_offset;
 
-    switch(input->info()->data_type())
+    switch(input->data_type())
     {
         case DataType::QASYMM8:
             _func = &depth_concat<uint8_t>;
@@ -175,11 +173,11 @@
     }
 
     // Configure kernel window
-    Window      win = calculate_max_window(*output->info(), Steps());
+    Window      win = calculate_max_window(*output, Steps());
     Coordinates coord;
-    coord.set_num_dimensions(output->info()->num_dimensions());
+    coord.set_num_dimensions(output->num_dimensions());
 
-    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+    output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
     INEKernel::configure(win);
 }
 
@@ -191,13 +189,16 @@
     return Status{};
 }
 
-void NEDepthConcatenateLayerKernel::run(const Window &window, const ThreadInfo &info)
+void NEDepthConcatenateLayerKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
     ARM_COMPUTE_ERROR_ON(_func == nullptr);
 
-    (*_func)(_input, _output, _depth_offset, window);
+    (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC),
+             tensors.get_tensor(TensorType::ACL_DST),
+             _depth_offset,
+             window);
 }
 } // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
index 79dc2cb..5df3e3e 100644
--- a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -849,7 +849,7 @@
                     const float16_t   scale_s = 1 << _shift;
                     const float16x8_t scale   = vdupq_n_f16(scale_s);
 
-                    /* Up-conversion F16 -> QASYMM8_SIGNED */
+                    /* Down-conversion F16 -> QASYMM8_SIGNED (Always saturating) */
                     execute_window_loop(win, [&](const Coordinates &)
                     {
                         const auto input_ptr  = reinterpret_cast<const float16_t *>(input.ptr());
@@ -872,7 +872,7 @@
                         // Compute left-over elements
                         for(; x < window_end_x; ++x)
                         {
-                            *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) * scale_s);
+                            *(output_ptr + x) = utils::cast::saturate_cast<int8_t>(*(input_ptr + x) * scale_s);
                         }
                     },
                     input, output);
@@ -884,7 +884,7 @@
                     const float16_t   scale_s = 1 << _shift;
                     const float16x8_t scale   = vdupq_n_f16(scale_s);
 
-                    /* Up-conversion F16 -> U8 */
+                    /* Down-conversion F16 -> QASYMM8/U8 (Always saturating) */
                     execute_window_loop(win, [&](const Coordinates &)
                     {
                         const auto input_ptr  = reinterpret_cast<const float16_t *>(input.ptr());
@@ -907,7 +907,7 @@
                         // Compute left-over elements
                         for(; x < window_end_x; ++x)
                         {
-                            *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) * scale_s);
+                            *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) * scale_s);
                         }
 
                     },
@@ -1215,7 +1215,7 @@
                         // Compute left-over elements
                         for(; x < window_end_x; ++x)
                         {
-                            *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) * scale_s);
+                            *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) * scale_s);
                         }
                     },
                     input, output);
diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
index 3641a84..618a1ba 100644
--- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
index 0be9d8f..134ebb0 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp
index ef196ab..62b2531 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -427,12 +427,11 @@
     if(is_data_type_quantized_per_channel(weights->data_type()))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size());
     }
     else
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     }
 
     if(biases != nullptr)
@@ -454,6 +453,7 @@
     {
         const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
 
     return Status{};
diff --git a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
index 947f257..fc0933b 100644
--- a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEDerivativeKernel.cpp b/src/core/NEON/kernels/NEDerivativeKernel.cpp
index 1d7237a..ad590e9 100644
--- a/src/core/NEON/kernels/NEDerivativeKernel.cpp
+++ b/src/core/NEON/kernels/NEDerivativeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEDilateKernel.cpp b/src/core/NEON/kernels/NEDilateKernel.cpp
index e761815..c30dab2 100644
--- a/src/core/NEON/kernels/NEDilateKernel.cpp
+++ b/src/core/NEON/kernels/NEDilateKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,16 +30,9 @@
 #include "arm_compute/core/Validate.h"
 
 #include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-using namespace arm_compute;
 
 namespace arm_compute
 {
-class Coordinates;
-} // namespace arm_compute
-
 BorderSize NEDilateKernel::border_size() const
 {
     return BorderSize(1);
@@ -47,6 +40,10 @@
 
 void NEDilateKernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
     _input  = input;
     _output = output;
 
@@ -126,3 +123,4 @@
     },
     in, out);
 }
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index 7f393d6..559b673 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -979,7 +979,7 @@
         //  |__________________|
         //  |     pad_bottom   |
         //  |******************|
-        const int max_offset = input_stride_z * input_depth - (input->info()->padding().bottom + input->info()->padding().top) * input_stride_y;
+        const int64_t max_offset = input_stride_z * input_depth - (input->info()->padding().bottom + input->info()->padding().top) * input_stride_y;
         execute_window_loop(window_k, [&](const Coordinates & id_k) // loop on the batch size
         {
 
@@ -1002,34 +1002,34 @@
                 for(int x = 0; x < input_width; x += num_elems_read_per_iteration)
                 {
                     // z == 0
-                    auto in_z   = static_cast<int>(id.z() * conv_stride_y - conv_pad_top);
+                    auto in_z   = static_cast<int64_t>(id.z() * conv_stride_y - conv_pad_top);
                     in_z        = std::min(static_cast<unsigned int>(in_z), static_cast<unsigned int>(input_depth));
                     auto offset = y_offset + in_z * input_stride_z;
                     offset      = std::min(offset, max_offset);
                     convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 0 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
 
                     // z == 1
-                    in_z   = static_cast<int>(id.z() * conv_stride_y - conv_pad_top + 1);
+                    in_z   = static_cast<int64_t>(id.z() * conv_stride_y - conv_pad_top + 1);
                     in_z   = std::min(static_cast<unsigned int>(in_z), static_cast<unsigned int>(input_depth));
                     offset = y_offset + in_z * input_stride_z;
                     offset = std::min(offset, max_offset);
                     convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 1 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
 
                     // z == 2
-                    in_z   = static_cast<int>(id.z() * conv_stride_y - conv_pad_top + 2);
+                    in_z   = static_cast<int64_t>(id.z() * conv_stride_y - conv_pad_top + 2);
                     in_z   = std::min(static_cast<unsigned int>(in_z), static_cast<unsigned int>(input_depth));
                     offset = y_offset + in_z * input_stride_z;
                     offset = std::min(offset, max_offset);
                     convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 2 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
 
                     // z == 3
-                    in_z   = static_cast<int>(id.z() * conv_stride_y - conv_pad_top + 3);
+                    in_z   = static_cast<int64_t>(id.z() * conv_stride_y - conv_pad_top + 3);
                     offset = y_offset + in_z * input_stride_z;
                     offset = std::min(offset, max_offset);
                     convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 3 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
 
                     // z == 4
-                    in_z   = static_cast<int>(id.z() * conv_stride_y - conv_pad_top + 4);
+                    in_z   = static_cast<int64_t>(id.z() * conv_stride_y - conv_pad_top + 4);
                     offset = y_offset + in_z * input_stride_z;
                     convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 4 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3);
 
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
index 2f106a3..2814c67 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -275,7 +275,7 @@
         }
 
         const auto out_ptr = reinterpret_cast<TOut *>(out.ptr());
-        wrapper::vstore(out_ptr, finalize_quantization<false>(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max));
+        wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false));
     },
     in, out);
 }
@@ -326,7 +326,7 @@
         }
 
         const auto out_ptr = reinterpret_cast<TOut *>(out.ptr());
-        wrapper::vstore(out_ptr, finalize_quantization<false>(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max));
+        wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false));
     },
     in, bi, out);
 }
diff --git a/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp b/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp
index 7b2b5e4..014a564 100644
--- a/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -1055,13 +1055,13 @@
 }
 
 std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)>
-configure_func(const ITensor *input1, const ITensor *input2, ITensor *output,
+configure_func(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output,
                std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function)
 {
     std::string function_to_call("op_");
-    function_to_call += string_from_data_type(input1->info()->data_type()) + "_";
-    function_to_call += string_from_data_type(input2->info()->data_type()) + "_";
-    function_to_call += string_from_data_type(output->info()->data_type());
+    function_to_call += string_from_data_type(input1->data_type()) + "_";
+    function_to_call += string_from_data_type(input2->data_type()) + "_";
+    function_to_call += string_from_data_type(output->data_type());
 
     auto it = map_function.find(function_to_call);
 
@@ -1078,7 +1078,7 @@
 
 template <ArithmeticOperation op>
 std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)>
-configure_arithm_func(const ITensor *input1, const ITensor *input2, ITensor *output)
+configure_arithm_func(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
 {
     static std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function =
     {
@@ -1097,7 +1097,7 @@
 
 template <ComparisonOperation op>
 std::function<void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window)>
-configure_comp_func(const ITensor *input1, const ITensor *input2, ITensor *output)
+configure_comp_func(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
 {
     static std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function =
     {
@@ -1140,41 +1140,38 @@
     return Status{};
 }
 
-void NEElementwiseOperationKernel::configure_common(const ITensor *input1, const ITensor *input2, ITensor *output)
+void NEElementwiseOperationKernel::configure_common(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
 
     // Configure kernel window
-    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
+    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
     const TensorShape &out_shape    = broadcast_pair.first;
     const ValidRegion &valid_region = broadcast_pair.second;
 
     // Auto initialize output if not initialized
-    auto_init_if_empty(*output->info(), out_shape, 1, input1->info()->data_type());
+    auto_init_if_empty(*output, out_shape, 1, input1->data_type());
 
     Window win = calculate_max_window(valid_region);
 
-    _input1 = input1;
-    _input2 = input2;
-    _output = output;
-
     INEKernel::configure(win);
 }
 
-void NEElementwiseOperationKernel::run(const Window &window, const ThreadInfo &info)
+void NEElementwiseOperationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info, window);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
     ARM_COMPUTE_ERROR_ON(_function == nullptr);
-    _function(_input1, _input2, _output, window);
+    _function(tensors.get_const_tensor(TensorType::ACL_SRC_0),
+              tensors.get_const_tensor(TensorType::ACL_SRC_1),
+              tensors.get_tensor(TensorType::ACL_DST), window);
 }
 
 /** Arithmetic operators (min, max, squared_diff) */
-
-void NEArithmeticOperationKernel::configure(ArithmeticOperation op, const ITensor *input1, const ITensor *input2, ITensor *output)
+void NEArithmeticOperationKernel::configure(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
 {
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output));
     configure_common(input1, input2, output);
     switch(op)
     {
@@ -1215,9 +1212,9 @@
 
 /** The division operator */
 
-void NEDivisionOperationKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+void NEDivisionOperationKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
 {
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output));
     configure_common(input1, input2, output);
     _function = configure_arithm_func<ArithmeticOperation::DIV>(input1, input2, output);
 }
@@ -1236,9 +1233,9 @@
 }
 
 /** The power operator */
-void NEPowerOperationKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+void NEPowerOperationKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
 {
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output));
     configure_common(input1, input2, output);
     _function = configure_arithm_func<ArithmeticOperation::POWER>(input1, input2, output);
 }
@@ -1257,10 +1254,9 @@
 }
 
 /** Comparison operators (equal, not equal, less than, greater than, less than or equal, greater than or equal) */
-
-void NEComparisonOperationKernel::configure(ComparisonOperation op, const ITensor *input1, const ITensor *input2, ITensor *output)
+void NEComparisonOperationKernel::configure(ComparisonOperation op, const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
 {
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output));
     configure_common(input1, input2, output);
     switch(op)
     {
diff --git a/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp b/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp
index 5d3af3b..747bd41 100644
--- a/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp
+++ b/src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,30 +26,17 @@
 #include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/NEON/NEMath.h"
 #include "arm_compute/core/NEON/wrapper/wrapper.h"
-#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "support/ToolchainSupport.h"
 
-#include <algorithm>
-#include <arm_neon.h>
-#include <cstdint>
-#include <map>
-#include <string>
-
 namespace arm_compute
 {
-class Coordinates;
-
 namespace
 {
-template <ElementWiseUnary op, typename ScalarType>
-inline ScalarType elementwise_op_scalar(const ScalarType &a)
+template <typename ScalarType>
+inline ScalarType elementwise_op_scalar_imp(ElementWiseUnary op, const ScalarType &a)
 {
     switch(op)
     {
@@ -72,9 +59,8 @@
     }
 }
 
-/* Elementwise operations that are supported for float */
-template <ElementWiseUnary op, typename ScalarType, bool is_float, typename VectorType, typename std::enable_if<is_float, int>::type = 0>
-inline VectorType elementwise_op(const VectorType &a)
+template <typename ScalarType, typename VectorType>
+inline VectorType elementwise_op_imp(ElementWiseUnary op, const VectorType &a)
 {
     switch(op)
     {
@@ -96,24 +82,10 @@
             ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
     }
 }
+} // namespace
 
-/* Elementwise operations that are supported for non floats */
-template < ElementWiseUnary op, typename ScalarType, bool is_float, typename VectorType, typename std::enable_if < !is_float, int >::type = 0 >
-inline VectorType elementwise_op(const VectorType &a)
-{
-    switch(op)
-    {
-        case ElementWiseUnary::NEG:
-            return wrapper::vneg(a);
-        case ElementWiseUnary::ABS:
-            return wrapper::vabs(a);
-        default:
-            ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
-    }
-}
-
-template <ElementWiseUnary op, typename ScalarType, bool is_float>
-void elementwise_op(const ITensor *in, ITensor *out, const Window &window)
+template <typename ScalarType>
+void NEElementwiseUnaryKernel::elementwise_op(const Window &window)
 {
     const int  window_step_x  = 16 / sizeof(ScalarType);
     const auto window_start_x = static_cast<int>(window.x().start());
@@ -122,8 +94,8 @@
     Window win = window;
     win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-    Iterator input(in, win);
-    Iterator output(out, win);
+    Iterator input(_input, win);
+    Iterator output(_output, win);
 
     execute_window_loop(win, [&](const Coordinates &)
     {
@@ -133,55 +105,24 @@
         int x = window_start_x;
         for(; x <= window_end_x - window_step_x; x += window_step_x)
         {
-            wrapper::vstore(output_ptr + x, elementwise_op<op, ScalarType, is_float>(wrapper::vloadq(input_ptr + x)));
+            wrapper::vstore(output_ptr + x, elementwise_op_imp<ScalarType>(_op, wrapper::vloadq(input_ptr + x)));
         }
         for(; x < window_end_x; ++x)
         {
-            *(output_ptr + x) = elementwise_op_scalar<op>(*(input_ptr + x));
+            *(output_ptr + x) = elementwise_op_scalar_imp(_op, *(input_ptr + x));
         }
     },
     input, output);
 }
 
-template <ElementWiseUnary op>
-std::function<void(const ITensor *input, ITensor *output, const Window &window)>
-configure_func(const ITensor *input, ITensor *output)
-{
-    std::string function_to_call("op_");
-    function_to_call += string_from_data_type(input->info()->data_type()) + "_";
-    function_to_call += string_from_data_type(output->info()->data_type());
-
-    static std::map<std::string, NEElementwiseUnaryKernel::ElementwiseUnaryFunction *> map_function =
-    {
-        { "op_F32_F32", &elementwise_op<op, float, true> },
-        { "op_S32_S32", &elementwise_op<op, int32_t, false> },
-    };
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    map_function["op_F16_F16"] = &elementwise_op<op, float16_t, true>;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-    auto it = map_function.find(function_to_call);
-
-    if(it != map_function.end())
-    {
-        auto func = it->second;
-        return [func](const ITensor * input, ITensor * output, const Window & window)
-        {
-            func(input, output, window);
-        };
-    }
-    return nullptr;
-}
-} // namespace
-
 NEElementwiseUnaryKernel::NEElementwiseUnaryKernel()
-    : _function(nullptr), _input(nullptr), _output(nullptr)
+    : _func(nullptr), _input(nullptr), _output(nullptr), _op()
 {
 }
 
 void NEElementwiseUnaryKernel::configure(ElementWiseUnary op, const ITensor *input, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(op, *input->info(), *output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(validate(op, input->info(), output->info()));
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Configure kernel window
@@ -196,69 +137,54 @@
 
     _input  = input;
     _output = output;
+    _op     = op;
 
     INEKernel::configure(win);
 
-    switch(op)
+    switch(input->info()->data_type())
     {
-        case ElementWiseUnary::RSQRT:
-            _function = configure_func<ElementWiseUnary::RSQRT>(input, output);
+        case DataType::F32:
+            _func = &NEElementwiseUnaryKernel::elementwise_op<float>;
             break;
-        case ElementWiseUnary::EXP:
-            _function = configure_func<ElementWiseUnary::EXP>(input, output);
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            _func = &NEElementwiseUnaryKernel::elementwise_op<float16_t>;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
             break;
-        case ElementWiseUnary::NEG:
-            _function = configure_func<ElementWiseUnary::NEG>(input, output);
-            break;
-        case ElementWiseUnary::LOG:
-            _function = configure_func<ElementWiseUnary::LOG>(input, output);
-            break;
-        case ElementWiseUnary::ABS:
-            _function = configure_func<ElementWiseUnary::ABS>(input, output);
-            break;
-        case ElementWiseUnary::ROUND:
-            _function = configure_func<ElementWiseUnary::ROUND>(input, output);
-            break;
-        case ElementWiseUnary::SIN:
-            _function = configure_func<ElementWiseUnary::SIN>(input, output);
+        case DataType::S32:
+            _func = &NEElementwiseUnaryKernel::elementwise_op<int32_t>;
             break;
         default:
-            ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
+            ARM_COMPUTE_ERROR("DataType not supported");
     }
 }
 
-Status NEElementwiseUnaryKernel::validate_arguments(ElementWiseUnary op, const ITensorInfo &input, const ITensorInfo &output)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input);
-    switch(op)
-    {
-        case ElementWiseUnary::EXP:
-        case ElementWiseUnary::RSQRT:
-        case ElementWiseUnary::LOG:
-        case ElementWiseUnary::ROUND:
-        case ElementWiseUnary::SIN:
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::F16, DataType::F32);
-            break;
-        case ElementWiseUnary::NEG:
-        case ElementWiseUnary::ABS:
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::F16, DataType::F32, DataType::S32);
-            break;
-        default:
-            ARM_COMPUTE_ERROR("ElementWiseUnary operation not supported");
-    }
-    // Validate in case of configured output
-    if(output.total_size() > 0)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
-    }
-
-    return Status{};
-}
-
 Status NEElementwiseUnaryKernel::validate(ElementWiseUnary op, const ITensorInfo *input, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(op, *input, *output));
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+    switch(op)
+    {
+        case ElementWiseUnary::EXP:
+        case ElementWiseUnary::RSQRT:
+        case ElementWiseUnary::LOG:
+        case ElementWiseUnary::ROUND:
+        case ElementWiseUnary::SIN:
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+            break;
+        case ElementWiseUnary::NEG:
+        case ElementWiseUnary::ABS:
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::S32);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("ElementWiseUnary operation not supported");
+    }
+    // Validate in case of configured output
+    if(output->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
     return Status{};
 }
 
@@ -267,7 +193,7 @@
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_function == nullptr);
-    _function(_input, _output, window);
+    ARM_COMPUTE_ERROR_ON(_func == nullptr);
+    (this->*_func)(window);
 }
 } // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEErodeKernel.cpp b/src/core/NEON/kernels/NEErodeKernel.cpp
index 2a538ec..4b93c3b 100644
--- a/src/core/NEON/kernels/NEErodeKernel.cpp
+++ b/src/core/NEON/kernels/NEErodeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,16 +30,9 @@
 #include "arm_compute/core/Validate.h"
 
 #include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-using namespace arm_compute;
 
 namespace arm_compute
 {
-class Coordinates;
-} // namespace arm_compute
-
 BorderSize NEErodeKernel::border_size() const
 {
     return BorderSize(1);
@@ -47,6 +40,10 @@
 
 void NEErodeKernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
     _input  = input;
     _output = output;
 
@@ -126,3 +123,4 @@
     },
     in, out);
 }
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
index cf77345..d5b20d2 100644
--- a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
+++ b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
index 148bbe9..c041b4c 100644
--- a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
+++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEFFTScaleKernel.cpp b/src/core/NEON/kernels/NEFFTScaleKernel.cpp
index 56703ba..ea2831f 100644
--- a/src/core/NEON/kernels/NEFFTScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEFFTScaleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEFastCornersKernel.cpp b/src/core/NEON/kernels/NEFastCornersKernel.cpp
index 81bcc8b..7b1d81e 100644
--- a/src/core/NEON/kernels/NEFastCornersKernel.cpp
+++ b/src/core/NEON/kernels/NEFastCornersKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEFillArrayKernel.cpp b/src/core/NEON/kernels/NEFillArrayKernel.cpp
index e59d75d..6b22dad 100644
--- a/src/core/NEON/kernels/NEFillArrayKernel.cpp
+++ b/src/core/NEON/kernels/NEFillArrayKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index 75d46c6..dbaec83 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp b/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp
deleted file mode 100644
index 50060b2..0000000
--- a/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <algorithm>
-#include <cstddef>
-#include <cstdint>
-
-using namespace arm_compute;
-
-namespace arm_compute
-{
-class Coordinates;
-} // namespace arm_compute
-
-NEFillInnerBorderKernel::NEFillInnerBorderKernel()
-    : _tensor(nullptr), _border_size(0), _constant_border_value(static_cast<float>(0.f))
-{
-}
-
-void NEFillInnerBorderKernel::configure(ITensor *input, BorderSize border_size, const PixelValue &constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::S32, DataType::F32);
-
-    _tensor                = input;
-    _border_size           = border_size;
-    _constant_border_value = constant_border_value;
-
-    Window win;
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    win.set(Window::DimY, Window::Dimension(0, 1, 1));
-    win.use_tensor_dimensions(_tensor->info()->tensor_shape(), Window::DimZ);
-    INEKernel::configure(win);
-}
-
-void NEFillInnerBorderKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-    // If there is no border: early exit
-    if(_border_size.empty())
-    {
-        return;
-    }
-
-    switch(_tensor->info()->data_type())
-    {
-        case DataType::U8:
-            fill_value_single_channel<uint8_t>(window);
-            break;
-        case DataType::S16:
-            fill_value_single_channel<int16_t>(window);
-            break;
-        case DataType::S32:
-            fill_value_single_channel<int32_t>(window);
-            break;
-        case DataType::F32:
-            static_assert(sizeof(float) == 4, "Float must be 32 bit");
-            fill_value_single_channel<float>(window);
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Not handled");
-            break;
-    }
-}
-
-template <typename T>
-void NEFillInnerBorderKernel::fill_value_single_channel(const Window &window)
-{
-    const size_t stride = _tensor->info()->strides_in_bytes()[1];
-    const size_t width  = _tensor->info()->dimension(0);
-    const size_t height = _tensor->info()->dimension(1);
-
-    T constant_border_value;
-    _constant_border_value.get(constant_border_value);
-
-    // Left and right border
-    // All X values are set at once
-    Window vertical(window);
-    vertical.set(Window::DimY, Window::Dimension(0, height, 1));
-
-    Iterator vertical_it(_tensor, vertical);
-
-    execute_window_loop(vertical, [&](const Coordinates &)
-    {
-        std::fill_n(reinterpret_cast<T *>(vertical_it.ptr()), _border_size.left, constant_border_value);
-        std::fill_n(reinterpret_cast<T *>(vertical_it.ptr()) + width - _border_size.right, _border_size.right, constant_border_value);
-    },
-    vertical_it);
-
-    // Top and bottom border
-    // All values are set at once
-    Iterator horizontal_it(_tensor, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        for(size_t i = 0; i < _border_size.top; ++i)
-        {
-            std::fill_n(reinterpret_cast<T *>(horizontal_it.ptr() + i * stride), width, constant_border_value);
-        }
-
-        for(size_t i = 0; i < _border_size.bottom; ++i)
-        {
-            std::fill_n(reinterpret_cast<T *>(horizontal_it.ptr() + (height - i - 1) * stride), width, constant_border_value);
-        }
-    },
-    horizontal_it);
-}
diff --git a/src/core/NEON/kernels/NEFlattenLayerKernel.cpp b/src/core/NEON/kernels/NEFlattenLayerKernel.cpp
index a48601f..35ebc5b 100644
--- a/src/core/NEON/kernels/NEFlattenLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEFlattenLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,19 +33,17 @@
 
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
-#include <arm_neon.h>
-
-using namespace arm_compute;
+namespace arm_compute
+{
 using namespace misc::shape_calculator;
 
 namespace
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
     // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
 
     // Checks performed when output is configured
     if(output->total_size() != 0)
@@ -135,3 +133,4 @@
     }
     while(in_window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice));
 }
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEFloorKernel.cpp b/src/core/NEON/kernels/NEFloorKernel.cpp
index 99dc31e..f078134 100644
--- a/src/core/NEON/kernels/NEFloorKernel.cpp
+++ b/src/core/NEON/kernels/NEFloorKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
index 6e7e5ab..282b1a6 100644
--- a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,6 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 
-#include "utils/TypePrinter.h"
 #include <map>
 
 namespace arm_compute
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
index 8f73bdb..3d17831 100644
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
index 10336e5..c5d7f10 100644
--- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
index 2293926..db6cb10 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -123,10 +123,9 @@
     return std::make_pair(err, win);
 }
 
-template <bool is_gemm3d>
 void run_offset_contribution(const Window &window,
                              ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row,
-                             int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col)
+                             int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col, bool is_gemm3d)
 {
     Window collapsed_window = window.collapse_if_possible(window, Window::DimZ);
 
@@ -398,12 +397,5 @@
                                    && _mm_result->info()->num_dimensions() > 1
                                    && _mm_result->info()->tensor_shape().y() != _vector_sum_row->info()->tensor_shape().x();
 
-    if(reinterpret_as_3d)
-    {
-        run_offset_contribution<true>(window, _mm_result, _vector_sum_col, _vector_sum_row, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col);
-    }
-    else
-    {
-        run_offset_contribution<false>(window, _mm_result, _vector_sum_col, _vector_sum_row, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col);
-    }
-}
\ No newline at end of file
+    run_offset_contribution(window, _mm_result, _vector_sum_col, _vector_sum_row, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, reinterpret_as_3d);
+}
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
index 31414e3..e9332b2 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -155,8 +155,7 @@
     };
 }
 
-template <bool    is_bounded_relu>
-inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8)
+inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, bool is_bounded_relu)
 {
     const static int32x4_t zero_s32 = vdupq_n_s32(0);
 
@@ -193,8 +192,7 @@
     return out_u8;
 }
 
-template <bool   is_bounded_relu>
-inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8)
+inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu)
 {
     const static int32x4_t zero_s32 = vdupq_n_s32(0);
 
@@ -231,8 +229,7 @@
     return out_s8;
 }
 
-template <bool   is_bounded_relu>
-inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8)
+inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu)
 {
     const static int32x4_t zero_s32 = vdupq_n_s32(0);
 
@@ -307,13 +304,13 @@
     return bias_it;
 }
 
-template <typename VT, bool has_a_offset, bool has_b_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point>
+template <typename VT>
 inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr, const int32_t *vector_sum_row_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it,
                                                         const int32x4_t result_offset_s32, const int32x4_t result_shift_s32,
                                                         typename VT::vtype min_vec, typename VT::vtype max_vec,
                                                         int32_t a_offset, int32_t b_offset, int32_t k_offset,
                                                         int32_t multiplier, int32_t shift, int32_t offset, int32_t min_bound, int32_t max_bound,
-                                                        int window_step_x, int window_start_x, int window_end_x)
+                                                        int window_step_x, int window_start_x, int window_end_x, bool has_a_offset, bool has_b_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point)
 {
     int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 };
     if(!is_fixed_point)
@@ -355,12 +352,12 @@
         if(is_fixed_point)
         {
             wrapper::vstore(reinterpret_cast<typename VT::stype *>(out_it.ptr() + x),
-                            finalize_quantization<is_bounded_relu>(in_s32, multiplier, shift, result_offset_s32, min_vec, max_vec));
+                            finalize_quantization(in_s32, multiplier, shift, result_offset_s32, min_vec, max_vec, is_bounded_relu));
         }
         else
         {
             wrapper::vstore(reinterpret_cast<typename VT::stype *>(out_it.ptr() + x),
-                            finalize_quantization_floating_point<is_bounded_relu>(in_s32, result_shift_s32, min_vec, max_vec));
+                            finalize_quantization_floating_point(in_s32, result_shift_s32, min_vec, max_vec, is_bounded_relu));
         }
     }
     // Compute left-over elements
@@ -380,9 +377,9 @@
         if(is_fixed_point)
         {
             // Finalize and store the result
-            *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, multiplier, shift, offset,
-                                                                                                               static_cast<typename VT::stype>(min_bound),
-                                                                                                               static_cast<typename VT::stype>(max_bound));
+            *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) = finalize_quantization(in_value, multiplier, shift, offset,
+                                                                                              static_cast<typename VT::stype>(min_bound),
+                                                                                              static_cast<typename VT::stype>(max_bound), is_bounded_relu);
         }
         else
         {
@@ -400,12 +397,11 @@
     }
 }
 
-template <bool has_a_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point>
 inline void run_offset_contribution_output_stage_window_symm(const int32_t *vector_sum_col_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it,
                                                              const int32_t *result_multipliers, const int32_t *result_shifts,
                                                              const int32x4_t result_offset, int8x16_t min_s8, int8x16_t max_s8,
                                                              int32_t a_offset, int32_t offset, int32_t min_bound, int32_t max_bound,
-                                                             int window_step_x, int window_start_x, int window_end_x)
+                                                             int window_step_x, int window_start_x, int window_end_x, bool has_a_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point)
 {
     int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 };
     if(!is_fixed_point)
@@ -435,11 +431,11 @@
 
         if(is_fixed_point)
         {
-            vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x), finalize_quantization_symm<is_bounded_relu>(in_s32, load(result_multipliers, x), load(result_shifts, x), result_offset, min_s8, max_s8));
+            vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x), finalize_quantization_symm(in_s32, load(result_multipliers, x), load(result_shifts, x), result_offset, min_s8, max_s8, is_bounded_relu));
         }
         else
         {
-            vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x), finalize_quantization_floating_point<is_bounded_relu>(in_s32, load(result_shifts, x), min_s8, max_s8));
+            vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x), finalize_quantization_floating_point(in_s32, load(result_shifts, x), min_s8, max_s8, is_bounded_relu));
         }
     }
     // Compute left-over elements
@@ -459,7 +455,7 @@
         if(is_fixed_point)
         {
             // Finalize and store the result
-            *(out_it.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, result_multipliers[x], result_shifts[x], offset, static_cast<int8_t>(min_bound), static_cast<int8_t>(max_bound));
+            *(out_it.ptr() + x) = finalize_quantization(in_value, result_multipliers[x], result_shifts[x], offset, static_cast<int8_t>(min_bound), static_cast<int8_t>(max_bound), is_bounded_relu);
         }
         else
         {
@@ -476,11 +472,11 @@
     }
 }
 
-template <typename T, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point>
+template <typename T>
 void run_offset_contribution_output_stage(const Window &window,
                                           const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output,
                                           int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col,
-                                          GEMMLowpOutputStageInfo output_stage)
+                                          GEMMLowpOutputStageInfo output_stage, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point)
 {
     using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
     using Typer        = VectorTyper<T>;
@@ -533,13 +529,13 @@
                 const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
                 const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
                                                 + id.y() + (id.z() % depth_input) * height_input;
-                run_offset_contribution_output_stage_window<Typer, true, true, true, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()),
-                                                                                                                      mm_result_it,
-                                                                                                                      out_it,
-                                                                                                                      result_offset_s32, result_shift_s32,
-                                                                                                                      min_vec, max_vec, a_offset, b_offset, k_offset,
-                                                                                                                      multiplier, shift, offset, min_bound, max_bound,
-                                                                                                                      window_step_x, window_start_x, window_end_x);
+                run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()),
+                                                                   mm_result_it,
+                                                                   out_it,
+                                                                   result_offset_s32, result_shift_s32,
+                                                                   min_vec, max_vec, a_offset, b_offset, k_offset,
+                                                                   multiplier, shift, offset, min_bound, max_bound,
+                                                                   window_step_x, window_start_x, window_end_x, true, true, true, is_bounded_relu, is_fixed_point);
             },
             vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it);
         }
@@ -551,11 +547,11 @@
                 const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
                 const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
                                                 + id.y() + (id.z() % depth_input) * height_input;
-                run_offset_contribution_output_stage_window<Typer, true, true, false, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
-                                                                                                                       result_offset_s32, result_shift_s32,
-                                                                                                                       min_vec, max_vec, a_offset, b_offset, k_offset,
-                                                                                                                       multiplier, shift, offset, min_bound, max_bound,
-                                                                                                                       window_step_x, window_start_x, window_end_x);
+                run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
+                                                                   result_offset_s32, result_shift_s32,
+                                                                   min_vec, max_vec, a_offset, b_offset, k_offset,
+                                                                   multiplier, shift, offset, min_bound, max_bound,
+                                                                   window_step_x, window_start_x, window_end_x, true, true, false, is_bounded_relu, is_fixed_point);
             },
             vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it);
         }
@@ -576,12 +572,12 @@
                 const int  batch_id           = id.z() / depth_input;
                 const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
                                                 + id.y() + (id.z() % depth_input) * height_input;
-                run_offset_contribution_output_stage_window<Typer, false, true, true, is_bounded_relu, is_fixed_point>(nullptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
-                                                                                                                       out_it,
-                                                                                                                       result_offset_s32, result_shift_s32,
-                                                                                                                       min_vec, max_vec, a_offset, b_offset, k_offset,
-                                                                                                                       multiplier, shift, offset, min_bound, max_bound,
-                                                                                                                       window_step_x, window_start_x, window_end_x);
+                run_offset_contribution_output_stage_window<Typer>(nullptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
+                                                                   out_it,
+                                                                   result_offset_s32, result_shift_s32,
+                                                                   min_vec, max_vec, a_offset, b_offset, k_offset,
+                                                                   multiplier, shift, offset, min_bound, max_bound,
+                                                                   window_step_x, window_start_x, window_end_x, false, true, true, is_bounded_relu, is_fixed_point);
             },
             vector_sum_row_it, bias_it, mm_result_it, out_it);
         }
@@ -592,11 +588,11 @@
                 const int  batch_id           = id.z() / depth_input;
                 const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
                                                 + id.y() + (id.z() % depth_input) * height_input;
-                run_offset_contribution_output_stage_window<Typer, false, true, false, is_bounded_relu, is_fixed_point>(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
-                                                                                                                        result_offset_s32, result_shift_s32,
-                                                                                                                        min_vec, max_vec, a_offset, b_offset, k_offset,
-                                                                                                                        multiplier, shift, offset, min_bound, max_bound,
-                                                                                                                        window_step_x, window_start_x, window_end_x);
+                run_offset_contribution_output_stage_window<Typer>(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
+                                                                   result_offset_s32, result_shift_s32,
+                                                                   min_vec, max_vec, a_offset, b_offset, k_offset,
+                                                                   multiplier, shift, offset, min_bound, max_bound,
+                                                                   window_step_x, window_start_x, window_end_x, false, true, false, is_bounded_relu, is_fixed_point);
             },
             vector_sum_row_it, mm_result_it, out_it);
         }
@@ -617,12 +613,12 @@
             {
                 const int  batch_id           = id.z() / depth_input;
                 const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
-                run_offset_contribution_output_stage_window<Typer, true, false, true, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
-                                                                                                                       out_it,
-                                                                                                                       result_offset_s32, result_shift_s32,
-                                                                                                                       min_vec, max_vec, a_offset, b_offset, k_offset,
-                                                                                                                       multiplier, shift, offset, min_bound, max_bound,
-                                                                                                                       window_step_x, window_start_x, window_end_x);
+                run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
+                                                                   out_it,
+                                                                   result_offset_s32, result_shift_s32,
+                                                                   min_vec, max_vec, a_offset, b_offset, k_offset,
+                                                                   multiplier, shift, offset, min_bound, max_bound,
+                                                                   window_step_x, window_start_x, window_end_x, true, false, true, is_bounded_relu, is_fixed_point);
             },
             vector_sum_col_it, bias_it, mm_result_it, out_it);
         }
@@ -632,11 +628,11 @@
             {
                 const int  batch_id           = id.z() / depth_input;
                 const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
-                run_offset_contribution_output_stage_window<Typer, true, false, false, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it,
-                                                                                                                        result_offset_s32, result_shift_s32,
-                                                                                                                        min_vec, max_vec, a_offset, b_offset, k_offset,
-                                                                                                                        multiplier, shift, offset, min_bound, max_bound,
-                                                                                                                        window_step_x, window_start_x, window_end_x);
+                run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it,
+                                                                   result_offset_s32, result_shift_s32,
+                                                                   min_vec, max_vec, a_offset, b_offset, k_offset,
+                                                                   multiplier, shift, offset, min_bound, max_bound,
+                                                                   window_step_x, window_start_x, window_end_x, true, false, false, is_bounded_relu, is_fixed_point);
             },
             vector_sum_col_it, mm_result_it, out_it);
         }
@@ -648,11 +644,11 @@
             Iterator bias_it = get_bias_it(collapsed_window, bias);
             execute_window_loop(collapsed_window, [&](const Coordinates &)
             {
-                run_offset_contribution_output_stage_window<Typer, false, false, true, is_bounded_relu, is_fixed_point>(nullptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
-                                                                                                                        result_offset_s32, result_shift_s32,
-                                                                                                                        min_vec, max_vec, a_offset, b_offset, k_offset,
-                                                                                                                        multiplier, shift, offset, min_bound, max_bound,
-                                                                                                                        window_step_x, window_start_x, window_end_x);
+                run_offset_contribution_output_stage_window<Typer>(nullptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+                                                                   result_offset_s32, result_shift_s32,
+                                                                   min_vec, max_vec, a_offset, b_offset, k_offset,
+                                                                   multiplier, shift, offset, min_bound, max_bound,
+                                                                   window_step_x, window_start_x, window_end_x, false, false, true, is_bounded_relu, is_fixed_point);
             },
             bias_it, mm_result_it, out_it);
         }
@@ -660,11 +656,11 @@
         {
             execute_window_loop(collapsed_window, [&](const Coordinates &)
             {
-                run_offset_contribution_output_stage_window<Typer, false, false, false, is_bounded_relu, is_fixed_point>(nullptr, nullptr, nullptr, mm_result_it, out_it,
-                                                                                                                         result_offset_s32, result_shift_s32,
-                                                                                                                         min_vec, max_vec, a_offset, b_offset, k_offset,
-                                                                                                                         multiplier, shift, offset, min_bound, max_bound,
-                                                                                                                         window_step_x, window_start_x, window_end_x);
+                run_offset_contribution_output_stage_window<Typer>(nullptr, nullptr, nullptr, mm_result_it, out_it,
+                                                                   result_offset_s32, result_shift_s32,
+                                                                   min_vec, max_vec, a_offset, b_offset, k_offset,
+                                                                   multiplier, shift, offset, min_bound, max_bound,
+                                                                   window_step_x, window_start_x, window_end_x, false, false, false, is_bounded_relu, is_fixed_point);
             },
             mm_result_it, out_it);
         }
@@ -672,11 +668,10 @@
     }
 }
 
-template <bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point>
 void run_offset_contribution_output_stage_symm(const Window &window,
                                                const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output,
                                                int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col,
-                                               GEMMLowpOutputStageInfo output_stage)
+                                               GEMMLowpOutputStageInfo output_stage, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point)
 {
     ARM_COMPUTE_UNUSED(vector_sum_row, b_offset, k_offset);
 
@@ -720,11 +715,11 @@
             {
                 const int  batch_id           = id.z() / depth_input;
                 const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
-                run_offset_contribution_output_stage_window_symm<true, true, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
-                                                                                                              result_multipliers, result_shifts,
-                                                                                                              result_offset_s32, min_s8, max_s8,
-                                                                                                              a_offset, offset, min_bound, max_bound,
-                                                                                                              window_step_x, window_start_x, window_end_x);
+                run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+                                                                 result_multipliers, result_shifts,
+                                                                 result_offset_s32, min_s8, max_s8,
+                                                                 a_offset, offset, min_bound, max_bound,
+                                                                 window_step_x, window_start_x, window_end_x, true, true, is_bounded_relu, is_fixed_point);
             },
             vector_sum_col_it, bias_it, mm_result_it, out_it);
         }
@@ -734,11 +729,11 @@
             {
                 const int  batch_id           = id.z() / depth_input;
                 const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
-                run_offset_contribution_output_stage_window_symm<true, false, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, nullptr, mm_result_it, out_it,
-                                                                                                               result_multipliers, result_shifts,
-                                                                                                               result_offset_s32, min_s8, max_s8,
-                                                                                                               a_offset, offset, min_bound, max_bound,
-                                                                                                               window_step_x, window_start_x, window_end_x);
+                run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, nullptr, mm_result_it, out_it,
+                                                                 result_multipliers, result_shifts,
+                                                                 result_offset_s32, min_s8, max_s8,
+                                                                 a_offset, offset, min_bound, max_bound,
+                                                                 window_step_x, window_start_x, window_end_x, true, false, is_bounded_relu, is_fixed_point);
             },
             vector_sum_col_it, mm_result_it, out_it);
         }
@@ -750,11 +745,11 @@
             Iterator bias_it = get_bias_it(collapsed_window, bias);
             execute_window_loop(collapsed_window, [&](const Coordinates &)
             {
-                run_offset_contribution_output_stage_window_symm<false, true, is_bounded_relu, is_fixed_point>(nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
-                                                                                                               result_multipliers, result_shifts,
-                                                                                                               result_offset_s32, min_s8, max_s8,
-                                                                                                               a_offset, offset, min_bound, max_bound,
-                                                                                                               window_step_x, window_start_x, window_end_x);
+                run_offset_contribution_output_stage_window_symm(nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+                                                                 result_multipliers, result_shifts,
+                                                                 result_offset_s32, min_s8, max_s8,
+                                                                 a_offset, offset, min_bound, max_bound,
+                                                                 window_step_x, window_start_x, window_end_x, false, true, is_bounded_relu, is_fixed_point);
             },
             bias_it, mm_result_it, out_it);
         }
@@ -762,11 +757,11 @@
         {
             execute_window_loop(collapsed_window, [&](const Coordinates &)
             {
-                run_offset_contribution_output_stage_window_symm<false, false, is_bounded_relu, is_fixed_point>(nullptr, nullptr, mm_result_it, out_it,
-                                                                                                                result_multipliers, result_shifts,
-                                                                                                                result_offset_s32, min_s8, max_s8,
-                                                                                                                a_offset, offset, min_bound, max_bound,
-                                                                                                                window_step_x, window_start_x, window_end_x);
+                run_offset_contribution_output_stage_window_symm(nullptr, nullptr, mm_result_it, out_it,
+                                                                 result_multipliers, result_shifts,
+                                                                 result_offset_s32, min_s8, max_s8,
+                                                                 a_offset, offset, min_bound, max_bound,
+                                                                 window_step_x, window_start_x, window_end_x, false, false, is_bounded_relu, is_fixed_point);
             },
             mm_result_it, out_it);
         }
@@ -860,81 +855,10 @@
 
     return std::make_pair(Status{}, win);
 }
-
-NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction
-get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, const ITensor *output, GEMMLowpOutputStageInfo output_stage)
-{
-    static std::map<uint8_t, NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction> map_function_qasymm =
-    {
-        { 0, &run_offset_contribution_output_stage<uint8_t, false, false, false> },
-        { 1, &run_offset_contribution_output_stage<uint8_t, true, false, false> },
-        { 2, &run_offset_contribution_output_stage<uint8_t, false, true, false> },
-        { 3, &run_offset_contribution_output_stage<uint8_t, true, true, false> },
-        { 4, &run_offset_contribution_output_stage<uint8_t, false, false, true> },
-        { 5, &run_offset_contribution_output_stage<uint8_t, true, false, true> },
-        { 6, &run_offset_contribution_output_stage<uint8_t, false, true, true> },
-        { 7, &run_offset_contribution_output_stage<uint8_t, true, true, true> },
-        { 8, &run_offset_contribution_output_stage<int8_t, false, false, false> },
-        { 9, &run_offset_contribution_output_stage<int8_t, true, false, false> },
-        { 10, &run_offset_contribution_output_stage<int8_t, false, true, false> },
-        { 11, &run_offset_contribution_output_stage<int8_t, true, true, false> },
-        { 12, &run_offset_contribution_output_stage<int8_t, false, false, true> },
-        { 13, &run_offset_contribution_output_stage<int8_t, true, false, true> },
-        { 14, &run_offset_contribution_output_stage<int8_t, false, true, true> },
-        { 15, &run_offset_contribution_output_stage<int8_t, true, true, true> },
-    };
-
-    static std::map<uint8_t, NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction> map_function_qsymm =
-    {
-        { 0, &run_offset_contribution_output_stage_symm<false, false, false> },
-        { 1, &run_offset_contribution_output_stage_symm<true, false, false> },
-        { 2, &run_offset_contribution_output_stage_symm<false, true, false> },
-        { 3, &run_offset_contribution_output_stage_symm<true, true, false> },
-        { 4, &run_offset_contribution_output_stage_symm<false, false, true> },
-        { 5, &run_offset_contribution_output_stage_symm<true, false, true> },
-        { 6, &run_offset_contribution_output_stage_symm<false, true, true> },
-        { 7, &run_offset_contribution_output_stage_symm<true, true, true> }
-    };
-
-    // Check if input is a 3D reinterpretation
-    const bool reinterpret_as_3d = vector_sum_row != nullptr
-                                   && mm_result->info()->num_dimensions() > 1
-                                   && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
-
-    // Check if we need to clamp the result using min and max
-    PixelValue type_min{};
-    PixelValue type_max{};
-    std::tie(type_min, type_max) = get_min_max(output->info()->data_type());
-    int32_t    type_min_int    = type_min.get<int32_t>();
-    int32_t    type_max_int    = type_max.get<int32_t>();
-    const bool is_bounded_relu = !(output_stage.gemmlowp_min_bound <= type_min_int && output_stage.gemmlowp_max_bound >= type_max_int);
-
-    // Check if we need to perform fixed point requantization
-    const bool is_fixed_point = output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN;
-
-    // Check if symmetric per-channel execution
-    const bool is_signed = output->info()->data_type() == DataType::QASYMM8_SIGNED;
-
-    // Check if symmetric per-channel execution
-    const bool is_symm = output_stage.is_quantized_per_channel;
-
-    // key acts as a bitset, setting the first bit on reinterpret_as_3d,
-    // the second on is_bounded_relu, and the third on is_fixed_point.
-    uint8_t key = (reinterpret_as_3d ? 1UL : 0UL) | ((is_bounded_relu ? 1UL : 0UL) << 1) | ((is_fixed_point ? 1UL : 0UL) << 2);
-    if(is_symm)
-    {
-        return map_function_qsymm.find(key)->second;
-    }
-    else
-    {
-        key |= ((is_signed ? 1UL : 0UL) << 3);
-        return map_function_qasymm.find(key)->second;
-    }
-}
 } // namespace
 
 NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageKernel()
-    : _function(nullptr), _vector_sum_col(nullptr), _vector_sum_row(nullptr), _bias(nullptr), _mm_result(nullptr), _output(nullptr), _a_offset(0), _b_offset(0), _k_offset(0), _slide_vector_sum_col(true),
+    : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _bias(nullptr), _mm_result(nullptr), _output(nullptr), _a_offset(0), _b_offset(0), _k_offset(0), _slide_vector_sum_col(true),
       _output_stage(GEMMLowpOutputStageInfo())
 
 {
@@ -977,8 +901,6 @@
     auto win_config = validate_and_configure_window(mm_result->info(), output->info());
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     INEKernel::configure(win_config.second);
-
-    _function = get_configured_function(mm_result, vector_sum_row, output, output_stage);
 }
 
 Status NEGEMMLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col,
@@ -996,7 +918,46 @@
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    _function(window, _mm_result, _vector_sum_col, _vector_sum_row, _bias, _output, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage);
+
+    PixelValue type_min{};
+    PixelValue type_max{};
+    std::tie(type_min, type_max) = get_min_max(_output->info()->data_type());
+    int32_t type_min_int = type_min.get<int32_t>();
+    int32_t type_max_int = type_max.get<int32_t>();
+
+    const bool reinterpret_as_3d = _vector_sum_row != nullptr
+                                   && _mm_result->info()->num_dimensions() > 1
+                                   && _mm_result->info()->tensor_shape().y() != _vector_sum_row->info()->tensor_shape().x();
+
+    const bool is_bounded_relu = !(_output_stage.gemmlowp_min_bound <= type_min_int && _output_stage.gemmlowp_max_bound >= type_max_int);
+
+    // Check if we need to perform fixed point requantization
+    const bool is_fixed_point = _output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN;
+
+    // Check if symmetric per-channel execution
+    const bool is_signed = _output->info()->data_type() == DataType::QASYMM8_SIGNED;
+
+    // Check if symmetric per-channel execution
+    const bool is_symm = _output_stage.is_quantized_per_channel;
+
+    if(is_symm)
+    {
+        run_offset_contribution_output_stage_symm(window, _mm_result, _vector_sum_col, _vector_sum_row, _bias, _output, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage,
+                                                  reinterpret_as_3d, is_bounded_relu, is_fixed_point);
+    }
+    else
+    {
+        if(is_signed)
+        {
+            run_offset_contribution_output_stage<int8_t>(window, _mm_result, _vector_sum_col, _vector_sum_row, _bias, _output, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage,
+                                                         reinterpret_as_3d, is_bounded_relu, is_fixed_point);
+        }
+        else
+        {
+            run_offset_contribution_output_stage<uint8_t>(window, _mm_result, _vector_sum_col, _vector_sum_row, _bias, _output, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, _output_stage,
+                                                          reinterpret_as_3d, is_bounded_relu, is_fixed_point);
+        }
+    }
 }
 
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp
index 80ba2af..458b94b 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
index 0580071..44d5565 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
index b8ca17e..a0a5c5d 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -140,7 +140,7 @@
                 in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
 
                 vst1q_s8(reinterpret_cast<int8_t *>(out.ptr() + x),
-                         finalize_quantization<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8));
+                         finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu));
             }
 
             // Compute left-over elements
@@ -152,8 +152,8 @@
                 // Add bias
                 in_value += bias_value;
                 // Finalize and store the result
-                *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift,
-                                                                                                    static_cast<int8_t>(_min), static_cast<int8_t>(_max));
+                *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift,
+                                                                                   static_cast<int8_t>(_min), static_cast<int8_t>(_max), is_bounded_relu);
             }
         },
         in, out, bias);
@@ -177,7 +177,7 @@
                 };
 
                 vst1q_s8(reinterpret_cast<int8_t *>(out.ptr() + x),
-                         finalize_quantization<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8));
+                         finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu));
             }
 
             // Compute left-over elements
@@ -186,8 +186,8 @@
                 const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
 
                 // Finalize and store the result
-                *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift,
-                                                                                                    static_cast<int8_t>(_min), static_cast<int8_t>(_max));
+                *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift,
+                                                                                   static_cast<int8_t>(_min), static_cast<int8_t>(_max), is_bounded_relu);
             }
         },
         in, out);
@@ -242,4 +242,4 @@
 
     (this->*_func)(window);
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
index 4a9d2f7..a926903 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -145,7 +145,7 @@
                 in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
                 in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
 
-                vst1q_u8(out.ptr() + x, finalize_quantization<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8));
+                vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu));
             }
 
             // Compute left-over elements
@@ -157,7 +157,7 @@
                 // Add bias
                 in_value += bias_value;
                 // Finalize and store the result
-                *(out.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max));
+                *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max), is_bounded_relu);
             }
         },
         in, out, bias);
@@ -180,7 +180,7 @@
                     }
                 };
 
-                vst1q_u8(out.ptr() + x, finalize_quantization<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8));
+                vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu));
             }
 
             // Compute left-over elements
@@ -189,7 +189,7 @@
                 const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
 
                 // Finalize and store the result
-                *(out.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max));
+                *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max), is_bounded_relu);
             }
         },
         in, out);
@@ -244,4 +244,4 @@
 
     (this->*_func)(window);
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
index 1acdb1e..2945307 100644
--- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,19 +24,10 @@
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
 
 #include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <cstddef>
-#include <cstdint>
 
 namespace arm_compute
 {
@@ -45,7 +36,7 @@
 Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
 
     if(output->total_size() > 0)
     {
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
deleted file mode 100644
index 5ac2323..0000000
--- a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-using namespace arm_compute;
-
-namespace
-{
-inline Status validate_arguments(const ITensorInfo *accum, const ITensorInfo *biases)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(accum);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
-    ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
-    ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != accum->dimension(0));
-
-    return Status{};
-}
-
-inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *accum, ITensorInfo *biases)
-{
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-    // Configure kernel window
-    Window win = calculate_max_window(*accum, Steps(num_elems_processed_per_iteration));
-
-    bool window_changed = update_window_and_padding(win,
-                                                    AccessWindowHorizontal(accum, 0, num_elems_processed_per_iteration),
-                                                    AccessWindowStatic(biases, 0, 0, ceil_to_multiple(biases->dimension(0), num_elems_processed_per_iteration), biases->tensor_shape().y()));
-
-    AccessWindowHorizontal output_access(accum, 0, num_elems_processed_per_iteration);
-
-    // Set the valid region for the accum tensor
-    Coordinates coord;
-    coord.set_num_dimensions(accum->num_dimensions());
-    output_access.set_valid_region(win, ValidRegion(coord, accum->tensor_shape()));
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-} // namespace
-
-NEGEMMMatrixAccumulateBiasesKernel::NEGEMMMatrixAccumulateBiasesKernel()
-    : _accum(nullptr), _biases(nullptr)
-{
-}
-
-void NEGEMMMatrixAccumulateBiasesKernel::configure(ITensor *accum, const ITensor *biases)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(accum, biases);
-
-    // Perform validate step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(accum->info(), biases->info()));
-
-    _biases = biases;
-    _accum  = accum;
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(accum->info(), biases->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    INEKernel::configure(win_config.second);
-}
-
-Status NEGEMMMatrixAccumulateBiasesKernel::validate(const ITensorInfo *accum, const ITensorInfo *biases)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(accum, biases));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(accum->clone().get(), biases->clone().get()).first);
-
-    return Status{};
-}
-
-void NEGEMMMatrixAccumulateBiasesKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-    Window win_biases;
-    win_biases.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), window.x().step()));
-    win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
-
-    Iterator in0_out(_accum, window);
-    Iterator in1(_biases, win_biases);
-
-    switch(_accum->info()->data_type())
-    {
-        case DataType::F32:
-        {
-            execute_window_loop(window, [&](const Coordinates &)
-            {
-                const float32x4x4_t accum  = vld4q_f32(reinterpret_cast<const float *>(in0_out.ptr()));
-                const float32x4x4_t biases = vld4q_f32(reinterpret_cast<const float *>(in1.ptr()));
-                const float32x4x4_t res =
-                {
-                    {
-                        vaddq_f32(accum.val[0], biases.val[0]),
-                        vaddq_f32(accum.val[1], biases.val[1]),
-                        vaddq_f32(accum.val[2], biases.val[2]),
-                        vaddq_f32(accum.val[3], biases.val[3])
-                    }
-                };
-
-                vst4q_f32(reinterpret_cast<float *>(in0_out.ptr()), res);
-            },
-            in0_out, in1);
-            break;
-        }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::F16:
-        {
-            execute_window_loop(window, [&](const Coordinates &)
-            {
-                const float16x8x2_t accum  = vld2q_f16(reinterpret_cast<const float16_t *>(in0_out.ptr()));
-                const float16x8x2_t biases = vld2q_f16(reinterpret_cast<const float16_t *>(in1.ptr()));
-                const float16x8x2_t res =
-                {
-                    {
-                        vaddq_f16(accum.val[0], biases.val[0]),
-                        vaddq_f16(accum.val[1], biases.val[1])
-                    }
-                };
-
-                vst2q_f16(reinterpret_cast<float16_t *>(in0_out.ptr()), res);
-            },
-            in0_out, in1);
-            break;
-        }
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-        default:
-            ARM_COMPUTE_ERROR("Data type not supported");
-            break;
-    }
-}
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
index 8ee46ea..2cac93a 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
index e3508a1..5bec9d3 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
deleted file mode 100644
index cf8411c..0000000
--- a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
+++ /dev/null
@@ -1,410 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input0);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
-    if(is_data_type_quantized_asymmetric(input0->data_type()))
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
-    }
-    else
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
-    }
-
-    ARM_COMPUTE_RETURN_ERROR_ON(input0->num_dimensions() == input1->num_dimensions());
-    ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(2) != input1->dimension(1));
-    ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(DataLayoutDimension::HEIGHT) != output->dimension(DataLayoutDimension::HEIGHT));
-    ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(DataLayoutDimension::WIDTH) != output->dimension(DataLayoutDimension::WIDTH));
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
-{
-    const unsigned int num_elems_read_per_iteration = 16 / input0->element_size();
-
-    Window win = calculate_max_window(*input0, Steps(num_elems_read_per_iteration));
-
-    AccessWindowHorizontal input0_access(input0, 0, num_elems_read_per_iteration);
-    AccessWindowHorizontal input1_access(input1, 0, num_elems_read_per_iteration);
-    AccessWindowStatic     output_access(output, 0, 0, output->dimension(0), output->dimension(1));
-
-    bool window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
-
-    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-} // namespace
-
-template <typename I0, typename I1, typename O>
-void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply(const Window &window_in, const Window &window_w, const Window &window_out)
-{
-    ARM_COMPUTE_ERROR("Unsupported data types");
-    ARM_COMPUTE_UNUSED(window_in);
-    ARM_COMPUTE_UNUSED(window_w);
-    ARM_COMPUTE_UNUSED(window_out);
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<half, half, half>(const Window &window_in,
-                                                                                const Window &window_w,
-                                                                                const Window &window_out)
-{
-    Iterator in(_input0, window_in);
-    Iterator in2(_input1, window_w);
-    Iterator out(_output, window_out);
-
-    const int input_w          = _input0->info()->dimension(0);
-    const int input_h          = _input0->info()->dimension(1);
-    const int input_stride_x   = _input0->info()->strides_in_bytes().x();
-    const int weights_stride_x = _input1->info()->strides_in_bytes().x();
-    const int weights_stride_y = _input1->info()->strides_in_bytes().y();
-    const int output_stride_x  = _output->info()->strides_in_bytes().x();
-
-    execute_window_loop(window_in, [&](const Coordinates & id)
-    {
-        // Get pointers
-        const uint8_t *const input_ptr   = in.ptr();
-        const uint8_t *const weights_ptr = in2.ptr() + id.z() * weights_stride_y;
-        auto                 output_ptr  = reinterpret_cast<__fp16 *>(out.ptr() + (id.y() + id.z() * input_h) * output_stride_x);
-
-        float16x8_t row_dot = vdupq_n_f16(0.f);
-        for(int i = 0; i < input_w; i += 8)
-        {
-            const auto input   = vld1q_f16(reinterpret_cast<const __fp16 *>(input_ptr + i * input_stride_x));
-            const auto weights = vld1q_f16(reinterpret_cast<const __fp16 *>(weights_ptr + i * weights_stride_x));
-            row_dot            = vaddq_f16(row_dot, vmulq_f16(input, weights));
-        }
-
-        auto temp = vadd_f16(vget_high_f16(row_dot), vget_low_f16(row_dot));
-        temp      = vpadd_f16(temp, temp);
-        temp      = vpadd_f16(temp, temp);
-
-        *output_ptr = vget_lane_f16(temp, 0);
-    },
-    in, in2, out);
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-template <>
-void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<float, float, float>(const Window &window_in,
-                                                                                   const Window &window_w,
-                                                                                   const Window &window_out)
-{
-    Iterator in(_input0, window_in);
-    Iterator in2(_input1, window_w);
-    Iterator out(_output, window_out);
-
-    const int input_w          = _input0->info()->dimension(0);
-    const int input_h          = _input0->info()->dimension(1);
-    const int input_stride_x   = _input0->info()->strides_in_bytes().x();
-    const int weights_stride_x = _input1->info()->strides_in_bytes().x();
-    const int weights_stride_y = _input1->info()->strides_in_bytes().y();
-    const int output_stride_x  = _output->info()->strides_in_bytes().x();
-
-    execute_window_loop(window_in, [&](const Coordinates & id)
-    {
-        // Get pointers
-        const uint8_t *const input_ptr   = in.ptr();
-        const uint8_t *const weights_ptr = in2.ptr() + id.z() * weights_stride_y;
-        auto                 output_ptr  = reinterpret_cast<float *>(out.ptr() + (id.y() + id.z() * input_h) * output_stride_x);
-
-        float32x4_t row_dot = vdupq_n_f32(0.f);
-        for(int i = 0; i < input_w; i += 4)
-        {
-            const auto input   = vld1q_f32(reinterpret_cast<const float *>(input_ptr + i * input_stride_x));
-            const auto weights = vld1q_f32(reinterpret_cast<const float *>(weights_ptr + i * weights_stride_x));
-            row_dot            = vaddq_f32(row_dot, vmulq_f32(input, weights));
-        }
-
-        auto temp = vadd_f32(vget_high_f32(row_dot), vget_low_f32(row_dot));
-        temp      = vpadd_f32(temp, temp);
-
-        *output_ptr = vget_lane_f32(temp, 0);
-    },
-    in, in2, out);
-}
-
-template <>
-void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<uint8_t, uint8_t, int32_t>(const Window &window_in,
-                                                                                         const Window &window_w,
-                                                                                         const Window &window_out)
-{
-    Iterator in(_input0, window_in);
-    Iterator in2(_input1, window_w);
-    Iterator out(_output, window_out);
-
-    const int input_offset   = -_input0->info()->quantization_info().uniform().offset;
-    const int weights_offset = -_input1->info()->quantization_info().uniform().offset;
-
-    const int input_w          = _input0->info()->dimension(0);
-    const int input_h          = _input0->info()->dimension(1);
-    const int input_stride_x   = _input0->info()->strides_in_bytes().x();
-    const int weights_stride_x = _input1->info()->strides_in_bytes().x();
-    const int weights_stride_y = _input1->info()->strides_in_bytes().y();
-    const int output_stride_x  = _output->info()->strides_in_bytes().x();
-    const int read_step        = 16 / _input0->info()->element_size();
-
-    const int32x4_t v_input_offset   = vdupq_n_s32(input_offset);
-    const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset);
-
-    execute_window_loop(window_in, [&](const Coordinates & id)
-    {
-        // Get pointers
-        const uint8_t *const input_ptr   = in.ptr();
-        const uint8_t *const weights_ptr = in2.ptr() + id.z() * weights_stride_y;
-        auto                 output_ptr  = reinterpret_cast<int32_t *>(out.ptr() + (id.y() + id.z() * input_h) * output_stride_x);
-
-        int32x4_t row_dot = vdupq_n_s32(0);
-        for(int i = 0; i < input_w; i += read_step)
-        {
-            // Read values
-            const auto input   = vld1q_u8(reinterpret_cast<const uint8_t *>(input_ptr + i * input_stride_x));
-            const auto weights = vld1q_u8(reinterpret_cast<const uint8_t *>(weights_ptr + i * weights_stride_x));
-
-            // Add offsets
-            const int32x4x4_t input_s32 =
-            {
-                {
-                    vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(input))))),
-                    vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_low_u8(input))))),
-                    vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_high_u8(input))))),
-                    vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_high_u8(input)))))
-                }
-            };
-            const int32x4x4_t weights_s32 =
-            {
-                {
-                    vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(weights))))),
-                    vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_low_u8(weights))))),
-                    vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_high_u8(weights))))),
-                    vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_high_u8(weights)))))
-                }
-            };
-
-            // Dot
-            row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[0], weights_s32.val[0]));
-            row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[1], weights_s32.val[1]));
-            row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[2], weights_s32.val[2]));
-            row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[3], weights_s32.val[3]));
-        }
-
-        // Reduction
-        auto temp = vadd_s32(vget_high_s32(row_dot), vget_low_s32(row_dot));
-        temp      = vpadd_s32(temp, temp);
-
-        *output_ptr = vget_lane_s32(temp, 0);
-    },
-    in, in2, out);
-}
-
-template <>
-void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<int8_t, int8_t, int32_t>(const Window &window_in,
-                                                                                       const Window &window_w,
-                                                                                       const Window &window_out)
-{
-    Iterator in(_input0, window_in);
-    Iterator in2(_input1, window_w);
-    Iterator out(_output, window_out);
-
-    const int input_offset   = -_input0->info()->quantization_info().uniform().offset;
-    const int weights_offset = -_input1->info()->quantization_info().uniform().offset;
-
-    const int input_w          = _input0->info()->dimension(0);
-    const int input_h          = _input0->info()->dimension(1);
-    const int input_stride_x   = _input0->info()->strides_in_bytes().x();
-    const int weights_stride_x = _input1->info()->strides_in_bytes().x();
-    const int weights_stride_y = _input1->info()->strides_in_bytes().y();
-    const int output_stride_x  = _output->info()->strides_in_bytes().x();
-    const int read_step        = 16 / _input0->info()->element_size();
-
-    const int32x4_t v_input_offset   = vdupq_n_s32(input_offset);
-    const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset);
-
-    execute_window_loop(window_in, [&](const Coordinates & id)
-    {
-        // Get pointers
-        const uint8_t *const input_ptr   = in.ptr();
-        const uint8_t *const weights_ptr = in2.ptr() + id.z() * weights_stride_y;
-        auto                 output_ptr  = reinterpret_cast<int32_t *>(out.ptr() + (id.y() + id.z() * input_h) * output_stride_x);
-
-        int32x4_t row_dot = vdupq_n_s32(0);
-        for(int i = 0; i < input_w; i += read_step)
-        {
-            // Read values
-            const auto input   = vld1q_s8(reinterpret_cast<const int8_t *>(input_ptr + i * input_stride_x));
-            const auto weights = vld1q_s8(reinterpret_cast<const int8_t *>(weights_ptr + i * weights_stride_x));
-
-            // Add offsets
-            const int32x4x4_t input_s32 =
-            {
-                {
-                    vaddw_s16(v_input_offset, vget_low_s16(vmovl_s8(vget_low_s8(input)))),
-                    vaddw_s16(v_input_offset, vget_high_s16(vmovl_s8(vget_low_s8(input)))),
-                    vaddw_s16(v_input_offset, vget_low_s16(vmovl_s8(vget_high_s8(input)))),
-                    vaddw_s16(v_input_offset, vget_high_s16(vmovl_s8(vget_high_s8(input))))
-                }
-            };
-            const int32x4x4_t weights_s32 =
-            {
-                {
-                    vaddw_s16(v_weights_offset, vget_low_s16(vmovl_s8(vget_low_s8(weights)))),
-                    vaddw_s16(v_weights_offset, vget_high_s16(vmovl_s8(vget_low_s8(weights)))),
-                    vaddw_s16(v_weights_offset, vget_low_s16(vmovl_s8(vget_high_s8(weights)))),
-                    vaddw_s16(v_weights_offset, vget_high_s16(vmovl_s8(vget_high_s8(weights))))
-                }
-            };
-
-            // Dot
-            row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[0], weights_s32.val[0]));
-            row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[1], weights_s32.val[1]));
-            row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[2], weights_s32.val[2]));
-            row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[3], weights_s32.val[3]));
-        }
-
-        // Reduction
-        auto temp = vadd_s32(vget_high_s32(row_dot), vget_low_s32(row_dot));
-        temp      = vpadd_s32(temp, temp);
-
-        *output_ptr = vget_lane_s32(temp, 0);
-    },
-    in, in2, out);
-}
-
-NEGEMMMatrixVectorMultiplyKernel::NEGEMMMatrixVectorMultiplyKernel()
-    : _func(nullptr), _input0(nullptr), _input1(nullptr), _output(nullptr), _border_size(0)
-{
-}
-
-BorderSize NEGEMMMatrixVectorMultiplyKernel::border_size() const
-{
-    return _border_size;
-}
-
-void NEGEMMMatrixVectorMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
-
-    _input0 = input0;
-    _input1 = input1;
-    _output = output;
-
-    // Set appropriate function to run
-    switch(input0->info()->data_type())
-    {
-        case DataType::QASYMM8:
-            _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<uint8_t, uint8_t, int32_t>;
-            break;
-        case DataType::QASYMM8_SIGNED:
-            _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<int8_t, int8_t, int32_t>;
-            break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::F16:
-            _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<half, half, half>;
-            break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-        case DataType::F32:
-            _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<float, float, float>;
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Unsupported data type");
-    }
-
-    // Configure kernel window
-    const unsigned int num_elems_read_per_iteration = 16 / _input0->info()->element_size();
-
-    const unsigned int border_x = ceil_to_multiple(input0->info()->dimension(0), num_elems_read_per_iteration) - input0->info()->dimension(0);
-    _border_size                = BorderSize(0, border_x);
-
-    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    INEKernel::configure(win_config.second);
-}
-
-Status NEGEMMMatrixVectorMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
-{
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first);
-    return Status{};
-}
-
-void NEGEMMMatrixVectorMultiplyKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-    Window window_slice = window.first_slice_window_3D();
-
-    Window window_in(window);
-    Window window_weights(window_slice);
-    Window window_out(window);
-
-    // Setup input0 slice
-    window_in.set(Window::DimX, Window::Dimension(0, _input0->info()->dimension(0), _input0->info()->dimension(0)));
-    window_in.set(Window::DimY, Window::Dimension(0, _input0->info()->dimension(1), 1));
-    window_in.set(Window::DimZ, Window::Dimension(0, _input0->info()->dimension(2), 1));
-
-    // Setup input1 and output slice. Their dimensions are increased in the kernel.
-    window_weights.set(Window::DimX, Window::Dimension(0, 0, 0));
-    window_weights.set(Window::DimY, Window::Dimension(0, 0, 0));
-    window_weights.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
-    window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
-    window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
-    window_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
-    (this->*_func)(window_in, window_weights, window_out);
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
index 88104f7..951cb19 100644
--- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,23 +24,16 @@
 #include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
 
 #include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 
 #include <arm_neon.h>
-#include <cstddef>
-#include <cstring>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 TensorShape get_output_shape(const ITensorInfo *input)
@@ -57,7 +50,6 @@
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
     //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
 
     if(output->total_size() != 0)
     {
@@ -192,3 +184,4 @@
         }
     }
 }
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEGatherKernel.cpp b/src/core/NEON/kernels/NEGatherKernel.cpp
index 0a7a8df..906e8a0 100644
--- a/src/core/NEON/kernels/NEGatherKernel.cpp
+++ b/src/core/NEON/kernels/NEGatherKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp b/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp
index f412980..18dd80e 100644
--- a/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp b/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp
index 0e4549e..99b5d4b 100644
--- a/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp
+++ b/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp b/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
index 13cee19..83d2877 100644
--- a/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
+++ b/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
index 940ccab..c3b1059 100644
--- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
index c58b1c0..84bb59e 100644
--- a/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
+++ b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEHOGDetectorKernel.cpp b/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
index 8f52399..eb0d450 100644
--- a/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
+++ b/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
index 34e68e7..340c694 100644
--- a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
+++ b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp
index 2f130d9..8a671bf 100644
--- a/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEHeightConcatenateLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,24 +58,23 @@
 } // namespace
 
 NEHeightConcatenateLayerKernel::NEHeightConcatenateLayerKernel()
-    : _input(nullptr), _output(nullptr), _height_offset(0)
+    : _height_offset(0)
 {
 }
 
-void NEHeightConcatenateLayerKernel::configure(const ITensor *input, unsigned int height_offset, ITensor *output)
+void NEHeightConcatenateLayerKernel::configure(const ITensorInfo *input, unsigned int height_offset, ITensorInfo *output)
 {
+    ARM_COMPUTE_UNUSED(input);
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), height_offset, output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, height_offset, output));
 
-    _input         = input;
-    _output        = output;
     _height_offset = height_offset;
 
     // Configure kernel window
-    Window      win = calculate_max_window(*output->info(), Steps());
+    Window      win = calculate_max_window(*output, Steps());
     Coordinates coord;
-    coord.set_num_dimensions(output->info()->num_dimensions());
-    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+    coord.set_num_dimensions(output->num_dimensions());
+    output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
     INEKernel::configure(win);
 }
 
@@ -85,30 +84,33 @@
     return Status{};
 }
 
-void NEHeightConcatenateLayerKernel::run(const Window &window, const ThreadInfo &info)
+void NEHeightConcatenateLayerKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
+    const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+    auto       dst = tensors.get_tensor(TensorType::ACL_DST);
+
     // Offset output pointer to the correct position
-    uint8_t *output_ptr = _output->buffer() + _output->info()->offset_first_element_in_bytes() + _height_offset * _output->info()->strides_in_bytes()[Window::DimY];
+    uint8_t *output_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + _height_offset * dst->info()->strides_in_bytes()[Window::DimY];
 
     const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end()) * static_cast<int>(_output->info()->element_size()); 
-    const int window_step_x = 16;
+    const auto window_end_x   = static_cast<int>(window.x().end()) * static_cast<int>(dst->info()->element_size());
+    const int  window_step_x  = 16;
 
     Window win{ window };
     win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    win.set(Window::DimY, Window::Dimension(0, _input->info()->tensor_shape().y(), 1));
+    win.set(Window::DimY, Window::Dimension(0, src->info()->tensor_shape().y(), 1));
 
     // Create iterators
-    Iterator input(_input, win);
-    Iterator output(_output, win);
+    Iterator input(src, win);
+    Iterator output(dst, win);
 
-    const DataType                 dt           = _input->info()->data_type();
-    const UniformQuantizationInfo &input_qinfo  = _input->info()->quantization_info().uniform();
-    const UniformQuantizationInfo &output_qinfo = _output->info()->quantization_info().uniform();
+    const DataType                 dt           = src->info()->data_type();
+    const UniformQuantizationInfo &input_qinfo  = src->info()->quantization_info().uniform();
+    const UniformQuantizationInfo &output_qinfo = dst->info()->quantization_info().uniform();
     if(dt == DataType::QASYMM8 && input_qinfo != output_qinfo)
     {
         execute_window_loop(win, [&](const Coordinates &)
diff --git a/src/core/NEON/kernels/NEHistogramKernel.cpp b/src/core/NEON/kernels/NEHistogramKernel.cpp
index 211ea1f..0f8397f 100644
--- a/src/core/NEON/kernels/NEHistogramKernel.cpp
+++ b/src/core/NEON/kernels/NEHistogramKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 8a3f7cd..1a2b95e 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
index 7fc9361..f650d97 100644
--- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/NEON/NEMath.h"
 #include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
@@ -40,7 +41,43 @@
 {
 namespace
 {
-template <typename T>
+template <typename InputType, typename AccType = InputType>
+void vector_float_sum(AccType &result, AccType &result_square, const InputType &inputs)
+{
+    result        = wrapper::vadd(result, inputs);
+    result_square = wrapper::vadd(result_square, wrapper::vmul(inputs, inputs));
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <>
+inline void vector_float_sum(float32x4_t &result, float32x4_t &result_square, const float16x8_t &inputs)
+{
+    vector_float_sum(result, result_square, wrapper::vcvt<float>(wrapper::vgetlow(inputs)));
+    vector_float_sum(result, result_square, wrapper::vcvt<float>(wrapper::vgethigh(inputs)));
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+template <typename InputType, typename AccType = InputType>
+InputType vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta)
+{
+    return wrapper::vadd(wrapper::vmul(wrapper::vsub(inputs, vec_mean), vec_multip), vec_beta);
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <>
+inline float16x8_t vector_float_norm(const float16x8_t &inputs, const float32x4_t &vec_mean, const float32x4_t &vec_multip, const float32x4_t &vec_beta)
+{
+    const auto  input_low   = wrapper::vcvt<float>(wrapper::vgetlow(inputs));
+    const auto  input_high  = wrapper::vcvt<float>(wrapper::vgethigh(inputs));
+    const auto  result_low  = wrapper::vcvt<float16_t>(vector_float_norm(input_low, vec_mean, vec_multip, vec_beta));
+    const auto  result_high = wrapper::vcvt<float16_t>(vector_float_norm(input_high, vec_mean, vec_multip, vec_beta));
+    float16x8_t result      = wrapper::vcombine(result_low, result_high);
+
+    return result;
+}
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+template <typename T, typename AccType = T>
 void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window)
 {
     /** NEON vector tag type. */
@@ -65,39 +102,37 @@
         Iterator input_plane_it(input, win_plane);
         Iterator output_plane_it(output, win_plane);
 
-        auto sum_h_w         = static_cast<T>(0.f);
-        auto sum_squares_h_w = static_cast<T>(0.f);
+        auto sum_h_w         = static_cast<AccType>(0.f);
+        auto sum_squares_h_w = static_cast<AccType>(0.f);
 
         execute_window_loop(win_plane, [&](const Coordinates &)
         {
             const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr());
 
-            auto vec_sum_h_w         = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
-            auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+            auto vec_sum_h_w         = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
+            auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
 
             // Compute S elements per iteration
             int x = window.x().start();
             for(; x <= (window.x().end() - window_step_x); x += window_step_x)
             {
-                auto vec_input_val  = wrapper::vloadq(input_ptr + x);
-                vec_sum_h_w         = wrapper::vadd(vec_sum_h_w, vec_input_val);
-                vec_sum_squares_h_w = wrapper::vadd(vec_sum_squares_h_w, wrapper::vmul(vec_input_val, vec_input_val));
+                auto vec_input_val = wrapper::vloadq(input_ptr + x);
+                vector_float_sum(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val);
             }
 
             auto vec2_sum_h_w         = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w));
             auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w));
-            for(int i = 0; i < window_step_x / 4; ++i)
-            {
-                vec2_sum_h_w         = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w);
-                vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w);
-            }
+
+            vec2_sum_h_w         = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w);
+            vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w);
+
             sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0);
             sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0);
 
             // Compute left-over elements
             for(; x < window.x().end(); ++x)
             {
-                const auto value = *(input_ptr + x);
+                const auto value = static_cast<AccType>(*(input_ptr + x));
                 sum_h_w += value;
                 sum_squares_h_w += value * value;
             }
@@ -108,9 +143,9 @@
         const auto var_h_w  = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w;
 
         const auto multip_h_w     = gamma / std::sqrt(var_h_w + epsilon);
-        const auto vec_mean_h_w   = wrapper::vdup_n(static_cast<T>(mean_h_w), ExactTagType{});
-        const auto vec_multip_h_w = wrapper::vdup_n(static_cast<T>(multip_h_w), ExactTagType{});
-        const auto vec_beta       = wrapper::vdup_n(static_cast<T>(beta), ExactTagType{});
+        const auto vec_mean_h_w   = wrapper::vdup_n(static_cast<AccType>(mean_h_w), ExactTagType{});
+        const auto vec_multip_h_w = wrapper::vdup_n(static_cast<AccType>(multip_h_w), ExactTagType{});
+        const auto vec_beta       = wrapper::vdup_n(static_cast<AccType>(beta), ExactTagType{});
 
         execute_window_loop(win_plane, [&](const Coordinates &)
         {
@@ -118,19 +153,20 @@
             auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr());
 
             // Compute S elements per iteration
-            int  x       = window.x().start();
-            auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{});
+            int x = window.x().start();
+            //auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{});
             for(; x <= (window.x().end() - window_step_x); x += window_step_x)
             {
-                vec_val = wrapper::vloadq(input_ptr + x);
-                vec_val = wrapper::vadd(wrapper::vmul(wrapper::vsub(vec_val, vec_mean_h_w), vec_multip_h_w), vec_beta);
-                wrapper::vstore(output_ptr + x, vec_val);
+                const auto vec_val        = wrapper::vloadq(input_ptr + x);
+                const auto normalized_vec = vector_float_norm(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta);
+                wrapper::vstore(output_ptr + x, normalized_vec);
             }
 
             // Compute left-over elements
             for(; x < window.x().end(); ++x)
             {
-                *(output_ptr + x) = ((*(input_ptr + x)) - mean_h_w) * multip_h_w + beta;
+                const auto val    = static_cast<AccType>(*(input_ptr + x));
+                *(output_ptr + x) = static_cast<T>((val - mean_h_w) * multip_h_w + beta);
             }
         },
         input_plane_it, output_plane_it);
@@ -179,17 +215,18 @@
 {
 }
 
-void NEInstanceNormalizationLayerKernel::configure(ITensor *input, ITensor *output, float gamma, float beta, float epsilon)
+void NEInstanceNormalizationLayerKernel::configure(ITensor *input, ITensor *output, const InstanceNormalizationLayerKernelInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
 
-    _input   = input;
-    _output  = output == nullptr ? input : output;
-    _gamma   = gamma;
-    _beta    = beta;
-    _epsilon = epsilon;
+    _input               = input;
+    _output              = output == nullptr ? input : output;
+    _gamma               = info.gamma;
+    _beta                = info.beta;
+    _epsilon             = info.epsilon;
+    _use_mixed_precision = info.use_mixed_precision;
 
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _output->info(), gamma, beta, epsilon));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _output->info(), _gamma, _beta, _epsilon));
 
     if(_input->info()->data_type() == DataType::F32)
     {
@@ -198,7 +235,14 @@
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     else if(_input->info()->data_type() == DataType::F16)
     {
-        _func = &instance_normalization_nchw<float16_t>;
+        if(_use_mixed_precision)
+        {
+            _func = &instance_normalization_nchw<float16_t, float>;
+        }
+        else
+        {
+            _func = &instance_normalization_nchw<float16_t>;
+        }
     }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     else
@@ -213,9 +257,9 @@
     INEKernel::configure(std::get<1>(win_config));
 }
 
-Status NEInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon)
+Status NEInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info.gamma, info.beta, info.epsilon));
     ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
     return Status{};
 }
diff --git a/src/core/NEON/kernels/NEIntegralImageKernel.cpp b/src/core/NEON/kernels/NEIntegralImageKernel.cpp
index b6db5f0..58ee3b4 100644
--- a/src/core/NEON/kernels/NEIntegralImageKernel.cpp
+++ b/src/core/NEON/kernels/NEIntegralImageKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
index 9900446..dbcfda2 100644
--- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,102 +45,87 @@
 template <typename T, int S>
 void l2_normalize_X(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window)
 {
-    /** NEON vector tag type. */
     using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
 
-    Window window_sum(window);
-    window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
+    const int  window_step_x  = 16 / data_size_from_type(in->info()->data_type());
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
 
-    Window in_slice  = window.first_slice_window_1D();
-    Window sum_slice = window_sum.first_slice_window_1D();
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-    do
+    Iterator input_it(in, win_collapsed);
+    Iterator sum_it(sum, win_collapsed);
+    Iterator output_it(out, win_collapsed);
+
+    execute_window_loop(win_collapsed, [&](const Coordinates &)
     {
-        Iterator input_it(in, in_slice);
-        Iterator sum_it(sum, sum_slice);
-        Iterator output_it(out, in_slice);
+        const auto in_ptr  = reinterpret_cast<const T *>(input_it.ptr());
+        const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
 
-        const auto sum_value           = *reinterpret_cast<const T *>(sum_it.ptr());
-        const auto vec_normalize_value = wrapper::vdup_n(static_cast<T>(1.f / std::sqrt(std::max(sum_value, static_cast<T>(epsilon)))), ExactTagType{});
+        const T    sum_value      = *reinterpret_cast<const T *>(sum_it.ptr());
+        const T    norm_value     = static_cast<T>(1.f) / std::sqrt(std::max(sum_value, static_cast<T>(epsilon)));
+        const auto vec_norm_value = wrapper::vdup_n(norm_value, ExactTagType{});
 
-        execute_window_loop(in_slice, [&](const Coordinates &)
+        // Compute elements over vector steps
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
         {
-            const auto in_ptr  = reinterpret_cast<const T *>(input_it.ptr());
-            const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
+            wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value));
+        }
 
-            wrapper::vstore(out_ptr, wrapper::vmul(wrapper::vloadq(in_ptr), vec_normalize_value));
-        },
-        input_it, output_it);
-    }
-    while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
+        {
+            out_ptr[x] = in_ptr[x] * norm_value;
+        }
+    },
+    input_it, sum_it, output_it);
 }
 
 template <typename T, int S>
-void l2_normalize_Y(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window)
+void l2_normalize_YZ(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
 {
-    /** NEON vector tag type. */
     using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
 
-    Window window_sum(window);
-    window_sum.set(Window::DimY, Window::Dimension(0, 0, 0));
+    const int  window_step_x  = 16 / data_size_from_type(in->info()->data_type());
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
 
-    Window in_slice  = window.first_slice_window_2D();
-    Window sum_slice = window_sum.first_slice_window_2D();
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-    do
+    Window window_sum(win);
+    window_sum.set(axis, Window::Dimension(0, 0, 0));
+
+    Iterator input_it(in, win);
+    Iterator sum_it(sum, window_sum);
+    Iterator output_it(out, win);
+
+    const auto vec_eps = wrapper::vdup_n(static_cast<T>(epsilon), ExactTagType{});
+
+    execute_window_loop(win, [&](const Coordinates &)
     {
-        Iterator input_it(in, in_slice);
-        Iterator sum_it(sum, sum_slice);
-        Iterator output_it(out, in_slice);
+        const auto in_ptr  = reinterpret_cast<const T *>(input_it.ptr());
+        const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr());
+        const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
 
-        auto eps = wrapper::vdup_n(static_cast<T>(epsilon), ExactTagType{});
-
-        execute_window_loop(in_slice, [&](const Coordinates &)
+        // Compute elements over vector steps
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
         {
-            const auto in_ptr  = reinterpret_cast<const T *>(input_it.ptr());
-            const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr());
-            const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
+            const auto vec_norm_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr + x), vec_eps));
+            wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value));
+        }
 
-            const auto vec_normalize_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr), eps));
-            wrapper::vstore(out_ptr, wrapper::vmul(wrapper::vloadq(in_ptr), vec_normalize_value));
-        },
-        input_it, sum_it, output_it);
-    }
-    while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
-}
-
-template <typename T, int S>
-void l2_normalize_Z(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window)
-{
-    /** NEON vector tag type. */
-    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
-
-    Window window_sum(window);
-    window_sum.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
-    Window in_slice  = window.first_slice_window_3D();
-    Window sum_slice = window_sum.first_slice_window_3D();
-
-    do
-    {
-        Iterator input_it(in, in_slice);
-        Iterator sum_it(sum, sum_slice);
-        Iterator output_it(out, in_slice);
-
-        auto eps = wrapper::vdup_n(static_cast<T>(epsilon), ExactTagType{});
-
-        execute_window_loop(in_slice, [&](const Coordinates &)
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
         {
-            const auto in_ptr  = reinterpret_cast<const T *>(input_it.ptr());
-            const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr());
-            const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
-
-            const auto vec_normalize_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr), eps));
-            wrapper::vstore(out_ptr, wrapper::vmul(wrapper::vloadq(in_ptr), vec_normalize_value));
-        },
-        input_it, sum_it, output_it);
-    }
-    while(window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice));
+            const T norm_value = static_cast<T>(1.f) / std::sqrt(std::max(sum_ptr[x], static_cast<T>(epsilon)));
+            out_ptr[x]         = in_ptr[x] * norm_value;
+        }
+    },
+    input_it, sum_it, output_it);
 }
 
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
@@ -170,27 +155,19 @@
     return Status{};
 }
 
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *sum, ITensorInfo *output, int axis)
+std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
 {
-    const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim);
-    const unsigned int num_elems_processed_per_iteration     = 16 / data_size_from_type(input->data_type());
-    const unsigned int num_elems_processed_per_iteration_sum = (actual_axis == 0) ? 1 : num_elems_processed_per_iteration;
-
-    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    Window win = calculate_max_window(*input, Steps());
 
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type());
 
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal sum_access(sum, 0, num_elems_processed_per_iteration_sum);
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    // NEL2NormalizeLayerKernel doesn't need padding so update_window_and_padding() can be skipped
+    Coordinates coord;
+    coord.set_num_dimensions(output->num_dimensions());
+    output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
 
-    bool window_changed = update_window_and_padding(win, input_access, sum_access, output_access);
-    output_access.set_valid_region(win, input->valid_region());
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-
-    return std::make_tuple(err, win);
+    return std::make_tuple(Status{}, win);
 }
 } // namespace
 
@@ -204,14 +181,14 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), sum->info(), output->info(), axis, epsilon));
 
-    _input   = input;
-    _sum     = sum;
-    _output  = output;
-    _actual_axis    = wrap_around(axis, max_input_tensor_dim);
-    _epsilon = epsilon;
+    _input       = input;
+    _sum         = sum;
+    _output      = output;
+    _actual_axis = wrap_around(axis, max_input_tensor_dim);
+    _epsilon     = epsilon;
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(_input->info(), _sum->info(), _output->info(), axis);
+    auto win_config = validate_and_configure_window(_input->info(), _output->info());
     ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
 
     INEKernel::configure(std::get<1>(win_config));
@@ -220,7 +197,7 @@
 Status NEL2NormalizeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, sum, output, axis, epsilon));
-    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), sum->clone().get(), output->clone().get(), axis)));
+    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
 
     return Status{};
 }
@@ -231,55 +208,23 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    switch(_actual_axis)
+    if(_actual_axis > 2)
     {
-        case 0:
-            switch(_input->info()->data_type())
-            {
-                case DataType::F32:
-                    l2_normalize_X<float, 4>(_input, _sum, _output, _epsilon, window);
-                    break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F16:
-                    l2_normalize_X<float16_t, 8>(_input, _sum, _output, _epsilon, window);
-                    break;
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                default:
-                    ARM_COMPUTE_ERROR("Not implemented");
-            }
+        ARM_COMPUTE_ERROR("Unsupported normalization axis");
+    }
+
+    switch(_input->info()->data_type())
+    {
+        case DataType::F32:
+            (_actual_axis == Window::DimX) ? l2_normalize_X<float, 4>(_input, _sum, _output, _epsilon, window) : l2_normalize_YZ<float, 4>(_input, _sum, _output, _epsilon, window, _actual_axis);
             break;
-        case 1:
-            switch(_input->info()->data_type())
-            {
-                case DataType::F32:
-                    l2_normalize_Y<float, 4>(_input, _sum, _output, _epsilon, window);
-                    break;
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F16:
-                    l2_normalize_Y<float16_t, 8>(_input, _sum, _output, _epsilon, window);
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not implemented");
-            }
+        case DataType::F16:
+            (_actual_axis == Window::DimX) ? l2_normalize_X<float16_t, 8>(_input, _sum, _output, _epsilon, window) : l2_normalize_YZ<float16_t, 8>(_input, _sum, _output, _epsilon, window, _actual_axis);
             break;
-        case 2:
-            switch(_input->info()->data_type())
-            {
-                case DataType::F32:
-                    l2_normalize_Z<float, 4>(_input, _sum, _output, _epsilon, window);
-                    break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                case DataType::F16:
-                    l2_normalize_Z<float16_t, 8>(_input, _sum, _output, _epsilon, window);
-                    break;
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                default:
-                    ARM_COMPUTE_ERROR("Not implemented");
-            }
-            break;
         default:
-            ARM_COMPUTE_ERROR("Unsupported normalization axis");
+            ARM_COMPUTE_ERROR("Not implemented");
     }
 }
 } // namespace arm_compute
diff --git a/src/core/NEON/kernels/NELKTrackerKernel.cpp b/src/core/NEON/kernels/NELKTrackerKernel.cpp
index ddf869e..533c241 100644
--- a/src/core/NEON/kernels/NELKTrackerKernel.cpp
+++ b/src/core/NEON/kernels/NELKTrackerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
index 467546a..dd2824b 100644
--- a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp b/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
index 8c09898..a0c1dbc 100644
--- a/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
+++ b/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp
new file mode 100644
index 0000000..821bf53
--- /dev/null
+++ b/src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.cpp
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "support/ToolchainSupport.h"
+
+namespace arm_compute
+{
+using namespace misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, indices);
+
+    int                 pool_stride_x   = 0;
+    int                 pool_stride_y   = 0;
+    PoolingType         pool_type       = pool_info.pool_type;
+    const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
+    std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
+    const int    pool_size_x = pool_info.pool_size.width;
+    const int    pool_size_y = pool_info.pool_size.height;
+    const Size2D pool_size(pool_size_x, pool_size_y);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+    }
+
+    return Status{};
+}
+} // namespace
+
+NEMaxUnpoolingLayerKernel::NEMaxUnpoolingLayerKernel()
+    : _func(nullptr), _input(nullptr), _output(nullptr), _indices(nullptr)
+{
+}
+
+void NEMaxUnpoolingLayerKernel::configure(const ITensor *input, const ITensor *indices, ITensor *output, const PoolingLayerInfo &pool_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, indices, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, indices->info()));
+
+    _input   = input;
+    _output  = output;
+    _indices = indices;
+
+    switch(input->info()->data_type())
+    {
+        case DataType::F32:
+            _func = &NEMaxUnpoolingLayerKernel::unpooling2<float>;
+            break;
+        case DataType::QASYMM8:
+            _func = &NEMaxUnpoolingLayerKernel::unpooling2<uint8_t>;
+            break;
+        case DataType::QASYMM8_SIGNED:
+            _func = &NEMaxUnpoolingLayerKernel::unpooling2<int8_t>;
+            break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            _func = &NEMaxUnpoolingLayerKernel::unpooling2<float16_t>;
+            break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        default:
+            break;
+    }
+    const TensorShape output_shape = compute_unpool_shape(*input->info(), pool_info);
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
+
+    auto window = calculate_max_window(*input->info(), Steps());
+    INEKernel::configure(window);
+}
+template <typename T>
+void NEMaxUnpoolingLayerKernel::unpooling2(const Window &window)
+{
+    Iterator  input(_input, window);
+    Iterator  indices(_indices, window);
+    auto      out_ptr      = reinterpret_cast<T *>(_output->buffer());
+    const int out_stride_w = static_cast<int>(_output->info()->strides_in_bytes()[3]);
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        auto vindices                                         = reinterpret_cast<uint32_t *>(indices.ptr());
+        auto vinput                                           = reinterpret_cast<T *>(input.ptr());
+        out_ptr[id[3] * out_stride_w / sizeof(T) + *vindices] = *vinput;
+    },
+    input, indices);
+}
+
+Status NEMaxUnpoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, indices));
+    return Status{};
+}
+
+void NEMaxUnpoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_func == nullptr);
+    // Run function
+    (this->*_func)(window);
+}
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEMeanStdDevKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevKernel.cpp
index afab6d6..914a21c 100644
--- a/src/core/NEON/kernels/NEMeanStdDevKernel.cpp
+++ b/src/core/NEON/kernels/NEMeanStdDevKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
index 57380b4..3fa4480 100644
--- a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEMedian3x3Kernel.cpp b/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
index 9dc1bc9..72225a4 100644
--- a/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEMemsetKernel.cpp b/src/core/NEON/kernels/NEMemsetKernel.cpp
index a0fab99..3870fa5 100644
--- a/src/core/NEON/kernels/NEMemsetKernel.cpp
+++ b/src/core/NEON/kernels/NEMemsetKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
index f69c883..b1c2b1c 100644
--- a/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEMinMaxLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
index 7fa2dc1..e956f9a 100644
--- a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
+++ b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
index 00536f0..f20e869 100644
--- a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
+++ b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
index 674a7c8..3e4c6e2 100644
--- a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
index e5f6e4f..6cd0780 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,8 +35,8 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
@@ -60,58 +60,13 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *input_squared, ITensorInfo *output, const NormalizationLayerInfo &norm_info)
-{
-    // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output, *input->clone());
-
-    const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
-
-    const unsigned int norm_idx              = get_normalization_dimension_index(input->data_layout(), norm_info);
-    const bool         is_norm_accross_width = norm_idx == 0;
-
-    const unsigned int border_width = is_norm_accross_width ? num_elems_processed_per_iteration - 1 : 0;
-    const BorderSize   border_size  = BorderSize(0, border_width);
-
-    // Configure window
-    Window win            = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    bool   window_changed = false;
-
-    if(is_norm_accross_width)
-    {
-        AccessWindowStatic input_access(input, -border_size.left, 0, input->dimension(0) + border_size.right, 0);
-        AccessWindowStatic input_squared_access(input_squared, -border_size.left, 0, input->dimension(0) + border_size.right, 0);
-        window_changed = window_changed || update_window_and_padding(win, input_access, input_squared_access);
-    }
-    else
-    {
-        AccessWindowHorizontal input_access(input, -border_size.left, num_elems_processed_per_iteration);
-        AccessWindowHorizontal input_squared_access(input_squared, -border_size.left, num_elems_processed_per_iteration);
-        window_changed = window_changed || update_window_and_padding(win, input_access, input_squared_access);
-    }
-
-    if(output->total_size() != 0)
-    {
-        AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-        window_changed = window_changed || update_window_and_padding(win, output_access);
-        output_access.set_valid_region(win, input->valid_region());
-    }
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
 } // namespace
 
 NENormalizationLayerKernel::NENormalizationLayerKernel()
-    : _func(nullptr), _input(nullptr), _input_squared(nullptr), _output(nullptr), _norm_info(NormType::IN_MAP_1D), _border_size()
+    : _func(nullptr), _input(nullptr), _input_squared(nullptr), _output(nullptr), _norm_info(NormType::IN_MAP_1D)
 {
 }
 
-BorderSize NENormalizationLayerKernel::border_size() const
-{
-    return _border_size;
-}
-
 void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_squared, output);
@@ -121,17 +76,12 @@
     // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), input_squared->info(), output->info(), norm_info));
 
-    const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
-
-    const unsigned int norm_idx              = get_normalization_dimension_index(input->info()->data_layout(), norm_info);
-    const bool         is_norm_accross_width = norm_idx == 0;
-    const unsigned int border_width          = is_norm_accross_width ? num_elems_processed_per_iteration - 1 : 0;
+    const unsigned int norm_idx = get_normalization_dimension_index(input->info()->data_layout(), norm_info);
 
     _input         = input;
     _input_squared = input_squared;
     _output        = output;
     _norm_info     = norm_info;
-    _border_size   = BorderSize(0, border_width);
 
     switch(_input->info()->data_type())
     {
@@ -210,9 +160,11 @@
     }
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), input_squared->info(), output->info(), norm_info);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    INEKernel::configure(win_config.second);
+    Window      win = calculate_max_window(*input->info(), Steps());
+    Coordinates coord;
+    coord.set_num_dimensions(output->info()->num_dimensions());
+    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+    INEKernel::configure(win);
 }
 
 template <typename T, unsigned int S, unsigned int dim, bool do_2D_norm>
@@ -221,15 +173,23 @@
     /** NEON vector tag type. */
     using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
 
-    Iterator input(_input, window);
-    Iterator input_squared(_input_squared, window);
-    Iterator output(_output, window);
+    Window win(window);
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-    const int dim_y                = _input->info()->data_layout() == DataLayout::NCHW ? 1 : 2;
-    const int radius               = _norm_info.norm_size() / 2;
-    const int input_squared_stride = _input_squared->info()->strides_in_bytes()[dim];
-    // We account padding across X only and we iterate over rows
-    const int min_left   = (dim == 2) ? 0 : -static_cast<int>(border_size().left);
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+    const int  window_step_x  = S;
+
+    Iterator input(_input, win);
+    Iterator input_squared(_input_squared, win);
+    Iterator output(_output, win);
+
+    const int dim_y                      = _input->info()->data_layout() == DataLayout::NCHW ? 1 : 2;
+    const int radius                     = _norm_info.norm_size() / 2;
+    const int input_squared_stride_x     = _input_squared->info()->strides_in_bytes()[0];
+    const int input_squared_stride_slice = _input_squared->info()->strides_in_bytes()[dim];
+    const int input_squared_stride_row   = _input_squared->info()->strides_in_bytes()[dim_y];
+
     const int max_right  = _input->info()->dimension(dim) - 1;
     const int max_bottom = _input->info()->dimension(dim_y) - 1;
 
@@ -237,33 +197,80 @@
     const auto beta_vec  = wrapper::vdup_n(static_cast<T>(_norm_info.beta()), ExactTagType{});
     const auto kappa_vec = wrapper::vdup_n(static_cast<T>(_norm_info.kappa()), ExactTagType{});
 
-    execute_window_loop(window, [&](const Coordinates & id)
+    auto sequential_normalization = [&](const int x, const Coordinates & id, const int current_row, const int first_row, const int last_row, const T * input_ptr, const uint8_t *input_squared_start_ptr,
+                                        T * output_ptr)
     {
-        // Get range to normalize
-        const int current_row   = do_2D_norm ? id[dim_y] : 0;
-        const int current_slice = id[dim];
-        const int first_row     = do_2D_norm ? std::max(current_row - radius, 0) : 0;
-        const int last_row      = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
-        const int first_slice   = std::max(current_slice - radius, min_left);
+        const int current_slice = dim == 0 ? x : id[dim];
+        const int first_slice   = std::max(current_slice - radius, 0);
         const int last_slice    = std::min(current_slice + radius, max_right);
 
+        const uint8_t *const input_squared_x_ptr = input_squared_start_ptr + x * input_squared_stride_x;
         // Accumulate 2D In-Map values
-        auto accu = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
-        for(int j = first_row; j <= last_row; j++)
+        auto accu = static_cast<T>(0.f);
+        for(int j = first_row; j <= last_row; ++j)
         {
             // Compute row displacement
-            const int            row               = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
-            const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
+            const uint8_t *const input_squared_ptr = input_squared_x_ptr + (j - current_row) * input_squared_stride_row;
             for(int i = first_slice; i <= last_slice; ++i)
             {
-                accu = wrapper::vadd(accu, wrapper::vloadq(reinterpret_cast<const T *>(input_squared_ptr + i * input_squared_stride)));
+                accu += *reinterpret_cast<const T *>(input_squared_ptr + (i - current_slice) * input_squared_stride_slice);
             }
         }
 
         // Normalize
-        const auto normalized       = wrapper::vpow(wrapper::vmla(kappa_vec, coeff_vec, accu), beta_vec);
-        const auto normalized_pixel = wrapper::vmul(wrapper::vloadq(reinterpret_cast<const T *>(input.ptr())), wrapper::vinv(normalized));
-        wrapper::vstore(reinterpret_cast<T *>(output.ptr()), normalized_pixel);
+        const auto normalized       = std::pow(accu * static_cast<T>(_norm_info.scale_coeff()) + static_cast<T>(_norm_info.kappa()), _norm_info.beta());
+        const auto normalized_pixel = (*(input_ptr + x)) / normalized;
+        *(output_ptr + x)           = normalized_pixel;
+    };
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
+        auto       output_ptr = reinterpret_cast<T *>(output.ptr());
+
+        // Get range to normalize
+        const int current_row = do_2D_norm ? id[dim_y] : 0;
+        const int first_row   = do_2D_norm ? std::max(current_row - radius, 0) : 0;
+        const int last_row    = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+
+        int x = window_start_x;
+        // Compute serially starting elements for the case x dimension is width
+        for(; x < radius && x < window_end_x && dim == 0; ++x)
+        {
+            sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), output_ptr);
+        }
+
+        // Compute vectorized
+        for(; x <= window_end_x - window_step_x - radius; x += window_step_x)
+        {
+            const int current_slice = dim == 0 ? x : id[dim];
+            const int first_slice   = std::max(current_slice - radius, 0);
+            const int last_slice    = std::min(current_slice + radius, max_right);
+
+            const uint8_t *const input_squared_x_ptr = input_squared.ptr() + x * input_squared_stride_x;
+            // Accumulate 2D In-Map values
+            auto accu = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+            for(int j = first_row; j <= last_row; ++j)
+            {
+                // Compute row displacement
+                const uint8_t *const input_squared_ptr = input_squared_x_ptr + (j - current_row) * input_squared_stride_row;
+                for(int i = first_slice; i <= last_slice; ++i)
+                {
+                    accu = wrapper::vadd(accu, wrapper::vloadq(reinterpret_cast<const T *>(input_squared_ptr + (i - current_slice) * input_squared_stride_slice)));
+                }
+            }
+
+            // Normalize
+            const auto normalized       = wrapper::vpow(wrapper::vmla(kappa_vec, coeff_vec, accu), beta_vec);
+            const auto normalized_pixel = wrapper::vmul(wrapper::vloadq(input_ptr + x), wrapper::vinv(normalized));
+            wrapper::vstore(reinterpret_cast<T *>(output_ptr + x), normalized_pixel);
+        }
+
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
+        {
+            sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), output_ptr);
+        }
     },
     input, input_squared, output);
 }
@@ -271,7 +278,6 @@
 Status NENormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo norm_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, input_squared, output, norm_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), input_squared->clone().get(), output->clone().get(), norm_info).first);
 
     return Status{};
 }
@@ -286,3 +292,4 @@
     // Run function
     (this->*_func)(window);
 }
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/NEON/kernels/NEPadLayerKernel.cpp b/src/core/NEON/kernels/NEPadLayerKernel.cpp
index 1a38b86..d840bb7 100644
--- a/src/core/NEON/kernels/NEPadLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPadLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEPermuteKernel.cpp b/src/core/NEON/kernels/NEPermuteKernel.cpp
index 2c0db76..737b10b 100644
--- a/src/core/NEON/kernels/NEPermuteKernel.cpp
+++ b/src/core/NEON/kernels/NEPermuteKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
index ca59e66..907a7f1 100644
--- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
+++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,8 +43,6 @@
 const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
 const float32x4_t positive_round_f32q    = vdupq_n_f32(0.5f);
 
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
 inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
 {
     ARM_COMPUTE_UNUSED(overflow_policy);
@@ -64,17 +62,18 @@
 
     if(output->total_size() > 0)
     {
-        if(is_data_type_quantized(output->data_type()))
-        {
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
-        }
-
         const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
 
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
                                         "Output can only be U8 if both inputs are U8");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QASYMM8 && (input1->data_type() != DataType::QASYMM8 || input2->data_type() != DataType::QASYMM8),
+                                        "Output can only be QASYMM8 if both inputs are QASYMM8");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QASYMM8_SIGNED && (input1->data_type() != DataType::QASYMM8_SIGNED || input2->data_type() != DataType::QASYMM8_SIGNED),
+                                        "Output can only be QASYMM8_SIGNED if both inputs are QASYMM8_SIGNED");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QSYMM16 && (input1->data_type() != DataType::QSYMM16 || input2->data_type() != DataType::QSYMM16),
+                                        "Output can only be QSYMM16 if both inputs are QSYMM16");
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::S32 && (input1->data_type() != DataType::QSYMM16 || input2->data_type() != DataType::QSYMM16),
                                         "Output can only be S32 if both inputs are QSYMM16");
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 output");
@@ -100,60 +99,6 @@
     return Status{};
 }
 
-inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
-{
-    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
-    const ValidRegion &valid_region = broadcast_pair.second;
-
-    // Auto initialize output if not initialized
-    {
-        ARM_COMPUTE_UNUSED(set_shape_if_empty(*output, input1->tensor_shape()));
-
-        if(input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16)
-        {
-            set_format_if_unknown(*output, Format::S16);
-        }
-        else if(input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32)
-        {
-            set_format_if_unknown(*output, Format::F32);
-        }
-        else if(input1->data_type() == DataType::F16 || input2->data_type() == DataType::F16)
-        {
-            set_format_if_unknown(*output, Format::F16);
-        }
-        else if(input1->data_type() == DataType::QASYMM8 || input2->data_type() == DataType::QASYMM8)
-        {
-            set_data_type_if_unknown(*output, DataType::QASYMM8);
-        }
-        else if(input1->data_type() == DataType::QASYMM8_SIGNED || input2->data_type() == DataType::QASYMM8_SIGNED)
-        {
-            set_data_type_if_unknown(*output, DataType::QASYMM8_SIGNED);
-        }
-        else if(input1->data_type() == DataType::QSYMM16 || input2->data_type() == DataType::QSYMM16)
-        {
-            set_data_type_if_unknown(*output, DataType::QSYMM16);
-        }
-    }
-
-    // Configure kernel window
-    Window win        = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
-    Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
-    Window win_input2 = win.broadcast_if_dimension_le_one(*input2);
-
-    AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-    bool window_changed = update_window_and_padding(win_input1, input1_access)
-                          || update_window_and_padding(win_input2, input2_access)
-                          || update_window_and_padding(win, output_access);
-
-    output_access.set_valid_region(win, valid_region);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-
 /* Scales a given vector by 1/255.
  *
  * @note This does not work for all cases. e.g. for float of 0.49999999999999994 and large floats.
@@ -178,224 +123,438 @@
     return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1)));
 }
 
-inline void mul_saturate_QASYMM8_QASYMM8_QASYMM8_n_opt(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr,
-                                                       float32x4_t input1_vscale, int32x4_t input1_voffset, float32x4_t input2_vscale, int32x4_t input2_voffset, float32x4_t output_voffset, float32x4_t vinvscale)
+template <typename T>
+inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x16_t>::type
+vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
 {
-    const auto input1 = static_cast<const qasymm8_t *__restrict>(input1_ptr);
-    const auto input2 = static_cast<const qasymm8_t *__restrict>(input2_ptr);
-    const auto output = static_cast<qasymm8_t *__restrict>(output_ptr);
-
-    const qasymm8x16_t input1_q = vld1q_u8(input1);
-    const qasymm8x16_t input2_q = vld1q_u8(input2);
-
-    // Dequantitize inputs
-    float32x4x4_t in1_f32x4x4;
-    float32x4x4_t in2_f32x4x4;
-    in1_f32x4x4.val[0] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(input1_q))))), input1_voffset)), input1_vscale);
-    in1_f32x4x4.val[1] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(input1_q))))), input1_voffset)), input1_vscale);
-    in1_f32x4x4.val[2] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(input1_q))))), input1_voffset)), input1_vscale);
-    in1_f32x4x4.val[3] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(input1_q))))), input1_voffset)), input1_vscale);
-
-    in2_f32x4x4.val[0] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(input2_q))))), input2_voffset)), input2_vscale);
-    in2_f32x4x4.val[1] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(input2_q))))), input2_voffset)), input2_vscale);
-    in2_f32x4x4.val[2] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(input2_q))))), input2_voffset)), input2_vscale);
-    in2_f32x4x4.val[3] = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(input2_q))))), input2_voffset)), input2_vscale);
-
-    float32x4x4_t out_f32x4x4;
-    out_f32x4x4.val[0] = vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]);
-    out_f32x4x4.val[1] = vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]);
-    out_f32x4x4.val[2] = vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]);
-    out_f32x4x4.val[3] = vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]);
-
-    int32x4x4_t rf;
-#ifdef __aarch64__
-    rf.val[0] = vcvtnq_s32_f32(vmlaq_f32(output_voffset, out_f32x4x4.val[0], vinvscale));
-    rf.val[1] = vcvtnq_s32_f32(vmlaq_f32(output_voffset, out_f32x4x4.val[1], vinvscale));
-    rf.val[2] = vcvtnq_s32_f32(vmlaq_f32(output_voffset, out_f32x4x4.val[2], vinvscale));
-    rf.val[3] = vcvtnq_s32_f32(vmlaq_f32(output_voffset, out_f32x4x4.val[3], vinvscale));
-#else  //__aarch64__
-    rf.val[0] = vcvtq_s32_f32(vmlaq_f32(output_voffset, out_f32x4x4.val[0], vinvscale));
-    rf.val[1] = vcvtq_s32_f32(vmlaq_f32(output_voffset, out_f32x4x4.val[1], vinvscale));
-    rf.val[2] = vcvtq_s32_f32(vmlaq_f32(output_voffset, out_f32x4x4.val[2], vinvscale));
-    rf.val[3] = vcvtq_s32_f32(vmlaq_f32(output_voffset, out_f32x4x4.val[3], vinvscale));
-#endif //__aarch64__
-    const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
-    const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
-
-    vst1q_u8(output, vcombine_u8(pa, pb));
+    return vquantize_signed(val, info);
 }
 
-inline void mul_saturate_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED_n(
-    const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr,
-    float scale, const UniformQuantizationInfo &input1_qua_info, const UniformQuantizationInfo &input2_qua_info,
-    const UniformQuantizationInfo &output_qua_info)
-
+template <typename T>
+inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x16_t>::type
+vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
 {
-    const auto                input1   = static_cast<const qasymm8_signed_t *__restrict>(input1_ptr);
-    const auto                input2   = static_cast<const qasymm8_signed_t *__restrict>(input2_ptr);
-    const auto                output   = static_cast<qasymm8_signed_t *__restrict>(output_ptr);
-    const qasymm8x16_signed_t input1_q = vld1q_s8(input1);
-    const qasymm8x16_signed_t input2_q = vld1q_s8(input2);
-    // Dequantitize inputs
-    const float32x4x4_t           in1_f32x4x4  = vdequantize(input1_q, input1_qua_info);
-    const float32x4x4_t           in2_f32x4x4  = vdequantize(input2_q, input2_qua_info);
-    const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };
-    const float32x4x4_t           out_f32x4x4 =
-    {
-        vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
-        vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
-        vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
-        vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
-    };
-    const int8x16_t result = vquantize_signed(out_f32x4x4, tmp_qua_info);
-    vst1q_s8(output, result);
+    return vquantize(val, info);
 }
 
-void mul_saturate_QSYMM16_QSYMM16_QSYMM16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale,
-                                            const UniformQuantizationInfo &input1_qua_info, const UniformQuantizationInfo &input2_qua_info, const UniformQuantizationInfo &output_qua_info)
+template <typename T>
+void mul_saturate_quantized_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
 {
-    const auto input1 = static_cast<const qsymm16_t *__restrict>(input1_ptr);
-    const auto input2 = static_cast<const qsymm16_t *__restrict>(input2_ptr);
-    const auto output = static_cast<qsymm16_t *__restrict>(output_ptr);
+    // Create input windows
+    Window win        = window;
+    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
 
-    const qsymm16x8x2_t input1_q =
-    {
-        {
-            vld1q_s16(input1),
-            vld1q_s16(input1 + 8),
-        }
-    };
-    const qsymm16x8x2_t input2_q =
-    {
-        {
-            vld1q_s16(input2),
-            vld1q_s16(input2 + 8),
-        }
-    };
+    // Clear X Dimension on execution window as we handle manually
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-    // Dequantitize inputs
-    const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
-    const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
+    const int  window_step_x         = 16 / sizeof(T);
+    const auto window_start_x        = static_cast<int>(window.x().start());
+    const auto window_end_x          = static_cast<int>(window.x().end());
+    const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);
+
+    const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();
+    const UniformQuantizationInfo tmp_qua_info    = { output_qua_info.scale / scale, output_qua_info.offset };
+
+    if(is_broadcast_across_x)
+    {
+        const bool                    is_broadcast_input_2 = input2_win.x().step() == 0;
+        Window                        broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
+        Window                        non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
+        const ITensor                *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
+        const ITensor                *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
+        const UniformQuantizationInfo broadcast_qinfo      = broadcast_tensor->info()->quantization_info().uniform();
+        const UniformQuantizationInfo non_broadcast_qinfo  = non_broadcast_tensor->info()->quantization_info().uniform();
+
+        // Clear X Dimension on execution window as we handle manually
+        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        Iterator broadcast_input(broadcast_tensor, broadcast_win);
+        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
+        Iterator output(out, win);
+
+        using ExactTagType = typename wrapper::traits::neon_vector<T, window_step_x>::tag_type;
+
+        execute_window_loop(win, [&](const Coordinates &)
+        {
+            const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
+            const auto output_ptr              = reinterpret_cast<T *>(output.ptr());
+
+            const auto broadcast_value     = *reinterpret_cast<const T *>(broadcast_input.ptr());
+            const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
+
+            // Compute window_step_x elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
+
+                // Dequantize inputs
+                const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo);
+                const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo);
+
+                const float32x4x4_t out_f32x4x4 =
+                {
+                    vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
+                    vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
+                    vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
+                    vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
+                };
+
+                // Quantize output
+                const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
+                wrapper::vstore(output_ptr + x, result);
+            }
+
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                // Dequantize inputs
+                const T     in1     = *(non_broadcast_input_ptr + x);
+                const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(in1, non_broadcast_qinfo);
+                const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(broadcast_value, broadcast_qinfo);
+                const float tmp_f   = tmp_in1 * tmp_in2;
+
+                // Quantize output
+                const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
+                *(output_ptr + x)  = tmp_qua;
+            }
+        },
+        broadcast_input, non_broadcast_input, output);
+    }
+    else
+    {
+        const UniformQuantizationInfo input1_qua_info = in1->info()->quantization_info().uniform();
+        const UniformQuantizationInfo input2_qua_info = in2->info()->quantization_info().uniform();
+
+        // Clear X Dimension on execution window as we handle manually
+        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        Iterator input1(in1, input1_win);
+        Iterator input2(in2, input2_win);
+        Iterator output(out, win);
+
+        execute_window_loop(win, [&](const Coordinates &)
+        {
+            const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
+            const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
+            const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+
+            // Compute window_step_x elements per iteration
+            int x = window_start_x;
+            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const auto input1_q = wrapper::vloadq(input1_ptr + x);
+                const auto input2_q = wrapper::vloadq(input2_ptr + x);
+
+                // Dequantize inputs
+                const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
+                const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
+
+                const float32x4x4_t out_f32x4x4 =
+                {
+                    vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
+                    vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
+                    vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
+                    vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
+                };
+
+                // Quantize output
+                const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
+                wrapper::vstore(output_ptr + x, result);
+            }
+
+            // Compute left-over elements
+            for(; x < window_end_x; ++x)
+            {
+                // Dequantize inputs
+                const T     in1     = *(input1_ptr + x);
+                const T     in2     = *(input2_ptr + x);
+                const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(in1, input1_qua_info);
+                const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(in2, input2_qua_info);
+                const float tmp_f   = tmp_in1 * tmp_in2;
+
+                // Quantize output
+                const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
+                *(output_ptr + x)  = tmp_qua;
+            }
+        },
+        input1, input2, output);
+    }
+}
+
/** Element-wise saturating multiplication of two QSYMM16 tensors into a QSYMM16 output.
 *
 * Both inputs are dequantized to float, multiplied, then requantized with the
 * user scale folded into the output quantization info. Inputs may broadcast
 * along dimensions of size 1.
 */
void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
{
    const UniformQuantizationInfo input1_qua_info = in1->info()->quantization_info().uniform();
    const UniformQuantizationInfo input2_qua_info = in2->info()->quantization_info().uniform();
    const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();

    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    // Fold the user-provided scale into the output quantization info so the
    // requantization step below is a single divide by tmp_qua_info.scale
    const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<qsymm16_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const qsymm16x8x2_t input1_q =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const qsymm16x8x2_t input2_q =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };

            // Dequantize inputs
            const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
            const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);

            const float32x4x4_t out_f32x4x4 =
            {
                vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
            };

            const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info);
            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            // Dequantize inputs (QSYMM16 has no offset, only a scale)
            const float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale;
            const float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale;
            const float tmp_f   = tmp_in1 * tmp_in2;

            // Quantize output, lrintf() has same rounding mode as vcombine_s16
            const int32_t tmp = lrintf(tmp_f / tmp_qua_info.scale);
            // Clamp to the QSYMM16 range *before* narrowing. The previous code cast
            // the boolean comparison (tmp > SHRT_MAX) instead of the clamped value.
            const qsymm16_t tmp_qua = static_cast<qsymm16_t>((tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp));
            *(output_ptr + x)       = tmp_qua;
        }
    },
    input1, input2, output);
}
 
/** Element-wise multiplication of two QSYMM16 tensors into a raw S32 output.
 *
 * No requantization takes place: each int16 pair is widened to int32 before
 * the multiply, so the full product is preserved. The scale argument is unused.
 */
void mul_QSYMM16_QSYMM16_S32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int scale)
{
    ARM_COMPUTE_UNUSED(scale);

    // Broadcast-aware windows for both inputs
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // The X dimension is traversed manually inside the loop body
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    constexpr int vec_elems = 16;
    const int     x_begin   = static_cast<int>(window.x().start());
    const int     x_end     = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto src1 = reinterpret_cast<const qsymm16_t *>(input1.ptr());
        const auto src2 = reinterpret_cast<const qsymm16_t *>(input2.ptr());
        const auto dst  = reinterpret_cast<int32_t *>(output.ptr());

        int x = x_begin;
        // Vector path: 16 elements (two int16x8 lanes) per step, widened to
        // int32 with vmovl before multiplying so the product cannot overflow
        for(; x <= (x_end - vec_elems); x += vec_elems)
        {
            const int16x8_t a_lo = vld1q_s16(src1 + x);
            const int16x8_t a_hi = vld1q_s16(src1 + x + 8);
            const int16x8_t b_lo = vld1q_s16(src2 + x);
            const int16x8_t b_hi = vld1q_s16(src2 + x + 8);

            // Each int16x8 lane pair yields two int32x4 products
            const int32x4_t p0 = vmulq_s32(vmovl_s16(vget_low_s16(a_lo)), vmovl_s16(vget_low_s16(b_lo)));
            const int32x4_t p1 = vmulq_s32(vmovl_s16(vget_high_s16(a_lo)), vmovl_s16(vget_high_s16(b_lo)));
            const int32x4_t p2 = vmulq_s32(vmovl_s16(vget_low_s16(a_hi)), vmovl_s16(vget_low_s16(b_hi)));
            const int32x4_t p3 = vmulq_s32(vmovl_s16(vget_high_s16(a_hi)), vmovl_s16(vget_high_s16(b_hi)));

            vst1q_s32(dst + x, p0);
            vst1q_s32(dst + x + 4, p1);
            vst1q_s32(dst + x + 8, p2);
            vst1q_s32(dst + x + 12, p3);
        }

        // Scalar tail for the remaining elements
        for(; x < x_end; ++x)
        {
            dst[x] = static_cast<int32_t>(src1[x]) * static_cast<int32_t>(src2[x]);
        }
    },
    input1, input2, output);
}
 
 template <bool is_scale255, bool is_sat>
-void mul_U8_U8_U8_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n)
+void mul_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
 {
-    const auto input1 = static_cast<const uint8_t *__restrict>(input1_ptr);
-    const auto input2 = static_cast<const uint8_t *__restrict>(input2_ptr);
-    const auto output = static_cast<uint8_t *__restrict>(output_ptr);
+    // Create input windows
+    Window win        = window;
+    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
 
-    const uint8x16_t ta1 = vld1q_u8(input1);
-    const uint8x16_t ta2 = vld1q_u8(input2);
+    // Clear X Dimension on execution window as we handle manually
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-    uint16x8_t       tmp1_high = vmovl_u8(vget_high_u8(ta1));
-    const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
-    uint16x8_t       tmp1_low  = vmovl_u8(vget_low_u8(ta1));
-    const uint16x8_t tmp2_low  = vmovl_u8(vget_low_u8(ta2));
+    Iterator input1(in1, input1_win);
+    Iterator input2(in2, input2_win);
+    Iterator output(out, win);
 
-    tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
-    tmp1_low  = vmulq_u16(tmp1_low, tmp2_low);
+    const int  window_step_x  = 16 / sizeof(uint8_t);
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
 
-    if(is_scale255)
+    execute_window_loop(win, [&](const Coordinates &)
     {
-        tmp1_high = scale255_U16_U16(tmp1_high);
-        tmp1_low  = scale255_U16_U16(tmp1_low);
-    }
-    else
-    {
-        const int16x8_t vn = vdupq_n_s16(-n);
+        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
+        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+        const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
 
-        if(is_sat)
+        // Compute window_step_x elements per iteration
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
         {
-            tmp1_high = vqshlq_u16(tmp1_high, vn);
-            tmp1_low  = vqshlq_u16(tmp1_low, vn);
-        }
-        else
-        {
-            tmp1_high = vshlq_u16(tmp1_high, vn);
-            tmp1_low  = vshlq_u16(tmp1_low, vn);
-        }
-    }
+            const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x);
+            const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x);
 
-    if(is_sat)
-    {
-        vst1q_u8(output, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
-    }
-    else
-    {
-        vst1q_u8(output, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
-    }
+            uint16x8_t       tmp1_high = vmovl_u8(vget_high_u8(ta1));
+            const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
+            uint16x8_t       tmp1_low  = vmovl_u8(vget_low_u8(ta1));
+            const uint16x8_t tmp2_low  = vmovl_u8(vget_low_u8(ta2));
+
+            tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
+            tmp1_low  = vmulq_u16(tmp1_low, tmp2_low);
+
+            if(is_scale255)
+            {
+                tmp1_high = scale255_U16_U16(tmp1_high);
+                tmp1_low  = scale255_U16_U16(tmp1_low);
+            }
+            else
+            {
+                const int16x8_t vn = vdupq_n_s16(-n);
+
+                if(is_sat)
+                {
+                    tmp1_high = vqshlq_u16(tmp1_high, vn);
+                    tmp1_low  = vqshlq_u16(tmp1_low, vn);
+                }
+                else
+                {
+                    tmp1_high = vshlq_u16(tmp1_high, vn);
+                    tmp1_low  = vshlq_u16(tmp1_low, vn);
+                }
+            }
+            if(is_sat)
+            {
+                vst1q_u8(output_ptr, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
+            }
+            else
+            {
+                vst1q_u8(output_ptr, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
+            }
+        }
+
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
+        {
+            uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x));
+
+            if(is_scale255)
+            {
+                float tmp_f = static_cast<float>(tmp) * scale255_constant;
+                tmp         = static_cast<uint16_t>(tmp_f + 0.5f);
+            }
+            else
+            {
+                tmp >>= n;
+            }
+            if(is_sat && tmp > 255)
+            {
+                tmp = 255;
+            }
+            *(output_ptr + x) = static_cast<uint8_t>(tmp);
+        }
+    },
+    input1, input2, output);
 }
 
 template <bool is_scale255, bool is_sat>
@@ -468,51 +627,185 @@
 }
 
 template <bool is_scale255, bool is_sat>
-void mul_S16_S16_S16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n)
+void mul_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
 {
-    const auto input1 = static_cast<const int16_t *__restrict>(input1_ptr);
-    const auto input2 = static_cast<const int16_t *__restrict>(input2_ptr);
-    const auto output = static_cast<int16_t *__restrict>(output_ptr);
+    // Create input windows
+    Window win        = window;
+    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
 
-    const int16x8x2_t ta1 =
-    {
-        {
-            vld1q_s16(input1),
-            vld1q_s16(input1 + 8),
-        }
-    };
-    const int16x8x2_t ta2 =
-    {
-        {
-            vld1q_s16(input2),
-            vld1q_s16(input2 + 8),
-        }
-    };
-    const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
+    // Clear X Dimension on execution window as we handle manually
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-    vst1q_s16(output, result.val[0]);
-    vst1q_s16(output + 8, result.val[1]);
+    Iterator input1(in1, input1_win);
+    Iterator input2(in2, input2_win);
+    Iterator output(out, win);
+
+    const int  window_step_x  = 16;
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
+        const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
+        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+        // Compute window_step_x elements per iteration
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        {
+            const int16x8x2_t ta1 =
+            {
+                {
+                    vld1q_s16(input1_ptr + x),
+                    vld1q_s16(input1_ptr + x + 8),
+                }
+            };
+            const int16x8x2_t ta2 =
+            {
+                {
+                    vld1q_s16(input2_ptr + x),
+                    vld1q_s16(input2_ptr + x + 8),
+                }
+            };
+            const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
+
+            vst1q_s16(output_ptr + x, result.val[0]);
+            vst1q_s16(output_ptr + x + 8, result.val[1]);
+        }
+
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
+        {
+            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
+
+            if(is_scale255)
+            {
+                float tmp_f = static_cast<float>(tmp) * scale255_constant;
+
+                tmp = static_cast<int32_t>(tmp_f + 0.5f);
+            }
+            else
+            {
+                if(tmp >= 0)
+                {
+                    tmp >>= n;
+                }
+                else
+                {
+                    uint32_t mask = (1u << n) - 1;
+                    tmp           = (tmp + static_cast<int32_t>(mask)) >> n;
+                }
+            }
+            if(is_sat)
+            {
+                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
+            }
+            *(output_ptr + x) = static_cast<int16_t>(tmp);
+        }
+    },
+    input1, input2, output);
 }
 
/** Element-wise multiplication of two F32 tensors, scaled by a constant factor.
 *
 * Handles the X-broadcast case (one input with X step 0) separately so the
 * broadcast value is loaded once per row instead of once per element.
 */
void mul_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    constexpr int window_step_x         = 16 / sizeof(float);
    const auto    window_start_x        = static_cast<int>(window.x().start());
    const auto    window_end_x          = static_cast<int>(window.x().end());
    const bool    is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);

    using ExactTagType = typename wrapper::traits::neon_vector<float, window_step_x>::tag_type;

    // scale is invariant over the whole execution: build its vector form once
    // instead of re-duplicating it in every loop iteration
    const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});

    if(is_broadcast_across_x)
    {
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<float *>(output.ptr());

            // The broadcast value is constant along X for this row
            const float broadcast_value     = *reinterpret_cast<const float *>(broadcast_input.ptr());
            const auto  broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
                const auto res             = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
                *(output_ptr + x)          = broadcast_value * non_broadcast_v * scale;
            }
        },
        broadcast_input, non_broadcast_input, output);
    }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(in1, input1_win);
        Iterator input2(in2, input2_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<float *>(output.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto ta1 = wrapper::vloadq(input1_ptr + x);
                const auto ta2 = wrapper::vloadq(input2_ptr + x);
                const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto ta1    = *(input1_ptr + x);
                const auto ta2    = *(input2_ptr + x);
                *(output_ptr + x) = ta1 * ta2 * scale;
            }
        },
        input1, input2, output);
    }
}
 
 void c_mul_F32_F32_F32_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr)
@@ -544,161 +837,297 @@
     wrapper::vstore(output, res);
 }
 
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** Element-wise multiplication of two F16 tensors, scaled by a constant factor.
 *
 * Only compiled when FP16 vector arithmetic is available
 * (arch=arm64-v8.2-a or later).
 */
void mul_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    // scale is loop-invariant: duplicate it into a vector once instead of
    // re-building it on every inner-loop iteration
    const float16x8_t scale_vec = vdupq_n_f16(scale);

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const float16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const float16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const float16x8x2_t ta1 =
            {
                {
                    vld1q_f16(input1_ptr + x),
                    vld1q_f16(input1_ptr + x + 8),
                }
            };
            const float16x8x2_t ta2 =
            {
                {
                    vld1q_f16(input2_ptr + x),
                    vld1q_f16(input2_ptr + x + 8),
                }
            };
            const float16x8x2_t result =
            {
                {
                    vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec),
                    vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec),
                }
            };
            vst1q_f16(output_ptr + x, result.val[0]);
            vst1q_f16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            const auto ta1    = *(input1_ptr + x);
            const auto ta2    = *(input2_ptr + x);
            *(output_ptr + x) = ta1 * ta2 * scale;
        }
    },
    input1, input2, output);
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 template <bool is_scale255, bool is_sat>
-void mul_U8_U8_S16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n)
+void mul_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
 {
-    const auto input1 = static_cast<const uint8_t *__restrict>(input1_ptr);
-    const auto input2 = static_cast<const uint8_t *__restrict>(input2_ptr);
-    const auto output = static_cast<int16_t *__restrict>(output_ptr);
+    // Create input windows
+    Window win        = window;
+    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
 
-    const uint8x16_t bv = vld1q_u8(input2);
-    const uint8x16_t av = vld1q_u8(input1);
+    // Clear X Dimension on execution window as we handle manually
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-    uint16x8_t tmp_low  = vmovl_u8(vget_low_u8(av));
-    uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
-    tmp_low             = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
-    tmp_high            = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));
+    Iterator input1(in1, input1_win);
+    Iterator input2(in2, input2_win);
+    Iterator output(out, win);
 
-    if(is_scale255)
+    const int  window_step_x  = 16 / sizeof(uint8_t);
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    execute_window_loop(win, [&](const Coordinates &)
     {
-        tmp_low  = scale255_U16_U16(tmp_low);
-        tmp_high = scale255_U16_U16(tmp_high);
-    }
-    else
-    {
-        const int16x8_t vn = vdupq_n_s16(-n);
+        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
+        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
 
-        if(is_sat)
+        // Compute window_step_x elements per iteration
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
         {
-            tmp_low  = vqshlq_u16(tmp_low, vn);
-            tmp_high = vqshlq_u16(tmp_high, vn);
+            const uint8x16_t bv = wrapper::vloadq(input2_ptr + x);
+            const uint8x16_t av = wrapper::vloadq(input1_ptr + x);
+
+            uint16x8_t tmp_low  = vmovl_u8(vget_low_u8(av));
+            uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
+            tmp_low             = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
+            tmp_high            = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));
+
+            if(is_scale255)
+            {
+                tmp_low  = scale255_U16_U16(tmp_low);
+                tmp_high = scale255_U16_U16(tmp_high);
+            }
+            else
+            {
+                const int16x8_t vn = vdupq_n_s16(-n);
+
+                if(is_sat)
+                {
+                    tmp_low  = vqshlq_u16(tmp_low, vn);
+                    tmp_high = vqshlq_u16(tmp_high, vn);
+                }
+                else
+                {
+                    tmp_low  = vshlq_u16(tmp_low, vn);
+                    tmp_high = vshlq_u16(tmp_high, vn);
+                }
+            }
+
+            if(is_sat)
+            {
+                static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);
+
+                tmp_low  = vminq_u16(tmp_low, max);
+                tmp_high = vminq_u16(tmp_high, max);
+            }
+
+            vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low));
+            vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high));
         }
-        else
+
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
         {
-            tmp_low  = vshlq_u16(tmp_low, vn);
-            tmp_high = vshlq_u16(tmp_high, vn);
+            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
+
+            if(is_scale255)
+            {
+                float tmp_f = static_cast<float>(tmp) * scale255_constant;
+                tmp         = static_cast<int32_t>(tmp_f + 0.5f);
+            }
+            else
+            {
+                tmp >>= n;
+            }
+
+            if(is_sat)
+            {
+                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp;
+            }
+
+            *(output_ptr + x) = static_cast<int16_t>(tmp);
         }
-    }
-
-    if(is_sat)
-    {
-        static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);
-
-        tmp_low  = vminq_u16(tmp_low, max);
-        tmp_high = vminq_u16(tmp_high, max);
-    }
-
-    vst1q_s16(output, vreinterpretq_s16_u16(tmp_low));
-    vst1q_s16(output + 8, vreinterpretq_s16_u16(tmp_high));
+    },
+    input1, input2, output);
 }
 
 template <bool is_scale255, bool is_sat>
-void mul_S16_U8_S16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n)
+void mul_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
 {
-    const auto input1 = static_cast<const int16_t *__restrict>(input1_ptr);
-    const auto input2 = static_cast<const uint8_t *__restrict>(input2_ptr);
-    const auto output = static_cast<int16_t *__restrict>(output_ptr);
+    // Create input windows
+    Window win        = window;
+    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
+    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
 
-    const int16x8x2_t ta1 =
-    {
-        {
-            vld1q_s16(input1),
-            vld1q_s16(input1 + 8),
-        }
-    };
-    const uint8x8x2_t ta2u =
-    {
-        {
-            vld1_u8(input2),
-            vld1_u8(input2 + 8),
-        }
-    };
-    const int16x8x2_t ta2 =
-    {
-        {
-            vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])),
-            vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))
-        }
-    };
+    // Clear X Dimension on execution window as we handle manually
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-    const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
+    Iterator input1(in1, input1_win);
+    Iterator input2(in2, input2_win);
+    Iterator output(out, win);
 
-    vst1q_s16(output, result.val[0]);
-    vst1q_s16(output + 8, result.val[1]);
+    const int  window_step_x  = 16;
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
+        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
+
+        // Compute window_step_x elements per iteration
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        {
+            const int16x8x2_t ta1 =
+            {
+                {
+                    vld1q_s16(input1_ptr + x),
+                    vld1q_s16(input1_ptr + x + 8),
+                }
+            };
+            const uint8x8x2_t ta2u =
+            {
+                {
+                    vld1_u8(input2_ptr + x),
+                    vld1_u8(input2_ptr + x + 8),
+                }
+            };
+            const int16x8x2_t ta2 =
+            {
+                {
+                    vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])),
+                    vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))
+                }
+            };
+
+            const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
+
+            vst1q_s16(output_ptr + x, result.val[0]);
+            vst1q_s16(output_ptr + x + 8, result.val[1]);
+        }
+
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
+        {
+            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
+
+            if(is_scale255)
+            {
+                float tmp_f = static_cast<float>(tmp) * scale255_constant;
+
+                tmp = static_cast<int32_t>(tmp_f + 0.5f);
+            }
+            else
+            {
+                if(tmp >= 0)
+                {
+                    tmp >>= n;
+                }
+                else
+                {
+                    uint32_t mask = (1u << n) - 1;
+                    tmp           = (tmp + static_cast<int32_t>(mask)) >> n;
+                }
+            }
+            if(is_sat)
+            {
+                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
+            }
+            *(output_ptr + x) = static_cast<int16_t>(tmp);
+        }
+    },
+    input1, input2, output);
 }
 
 template <bool is_scale255, bool is_sat>
-void mul_U8_S16_S16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n)
+void mul_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
 {
     // Simply swap the two input buffers
-    mul_S16_U8_S16_n<is_scale255, is_sat>(input2_ptr, input1_ptr, output_ptr, n);
+    mul_S16_U8_S16<is_scale255, is_sat>(in2, in1, out, window, n);
 }
 } // namespace
 
 NEPixelWiseMultiplicationKernel::NEPixelWiseMultiplicationKernel()
-    : _func_float(nullptr), _func_int(nullptr), _func_quantized(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _scale{ 0 }, _scale_exponent{ 0 }, _run_optimized_qasymm8(false)
+    : _func_float(nullptr), _func_int(nullptr), _func_quantized(nullptr), _scale{ 0 }, _scale_exponent{ 0 }
 {
 }
 
-void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+void NEPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
 {
     ARM_COMPUTE_UNUSED(rounding_policy);
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
 
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy));
 
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
+    const TensorShape &out_shape    = broadcast_pair.first;
+    const ValidRegion &valid_region = broadcast_pair.second;
 
-    _input1                = input1;
-    _input2                = input2;
-    _output                = output;
-    _scale                 = scale;
-    _scale_exponent        = 0;
-    _func_quantized        = nullptr;
-    _func_int              = nullptr;
-    _func_float            = nullptr;
-    _run_optimized_qasymm8 = false;
+    // Auto initialize output if not initialized
+    set_shape_if_empty(*output, out_shape);
+
+    _scale          = scale;
+    _scale_exponent = 0;
+    _func_quantized = nullptr;
+    _func_int       = nullptr;
+    _func_float     = nullptr;
 
     bool is_scale_255 = false;
     // Check and validate scaling factor
@@ -717,98 +1146,114 @@
         _scale_exponent = std::abs(exponent - 1);
     }
 
-    const DataType dt_input1 = input1->info()->data_type();
-    const DataType dt_input2 = input2->info()->data_type();
-    const DataType dt_output = output->info()->data_type();
+    const DataType dt_input1 = input1->data_type();
+    const DataType dt_input2 = input2->data_type();
+    const DataType dt_output = output->data_type();
     const bool     is_sat    = (overflow_policy == ConvertPolicy::SATURATE);
 
-    if(dt_input1 == DataType::QASYMM8 && dt_input2 == DataType::QASYMM8)
+    switch(dt_input1)
     {
-        _run_optimized_qasymm8 = true;
-    }
-    else if(dt_input1 == DataType::QASYMM8_SIGNED && dt_input2 == DataType::QASYMM8_SIGNED)
-    {
-        _func_quantized = &mul_saturate_QASYMM8_SIGNED_QASYMM8_SIGNED_QASYMM8_SIGNED_n;
-    }
-    else if(dt_input1 == DataType::QSYMM16 && dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16)
-    {
-        _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16_n;
-    }
-    else if(dt_input1 == DataType::QSYMM16 && dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32)
-    {
-        _func_int = &mul_QSYMM16_QSYMM16_S32_n;
-    }
-    else if(DataType::U8 == dt_input1 && DataType::U8 == dt_input2 && DataType::U8 == dt_output)
-    {
-        if(is_scale_255)
-        {
-            _func_int = is_sat ? &mul_U8_U8_U8_n<true, true> : &mul_U8_U8_U8_n<true, false>;
-        }
-        else
-        {
-            _func_int = is_sat ? &mul_U8_U8_U8_n<false, true> : &mul_U8_U8_U8_n<false, false>;
-        }
-    }
-    else if(DataType::S16 == dt_input1 && DataType::S16 == dt_input2 && DataType::S16 == dt_output)
-    {
-        if(is_scale_255)
-        {
-            _func_int = is_sat ? &mul_S16_S16_S16_n<true, true> : &mul_S16_S16_S16_n<true, false>;
-        }
-        else
-        {
-            _func_int = is_sat ? &mul_S16_S16_S16_n<false, true> : &mul_S16_S16_S16_n<false, false>;
-        }
-    }
-    else if(DataType::S16 == dt_input1 && DataType::U8 == dt_input2 && DataType::S16 == dt_output)
-    {
-        if(is_scale_255)
-        {
-            _func_int = is_sat ? &mul_S16_U8_S16_n<true, true> : &mul_S16_U8_S16_n<true, false>;
-        }
-        else
-        {
-            _func_int = is_sat ? &mul_S16_U8_S16_n<false, true> : &mul_S16_U8_S16_n<false, false>;
-        }
-    }
-    else if(DataType::U8 == dt_input1 && DataType::S16 == dt_input2 && DataType::S16 == dt_output)
-    {
-        if(is_scale_255)
-        {
-            _func_int = is_sat ? &mul_U8_S16_S16_n<true, true> : &mul_U8_S16_S16_n<true, false>;
-        }
-        else
-        {
-            _func_int = is_sat ? &mul_U8_S16_S16_n<false, true> : &mul_U8_S16_S16_n<false, false>;
-        }
-    }
-    else if(DataType::U8 == dt_input1 && DataType::U8 == dt_input2 && DataType::S16 == dt_output)
-    {
-        if(is_scale_255)
-        {
-            _func_int = is_sat ? &mul_U8_U8_S16_n<true, true> : &mul_U8_U8_S16_n<true, false>;
-        }
-        else
-        {
-            _func_int = is_sat ? &mul_U8_U8_S16_n<false, true> : &mul_U8_U8_S16_n<false, false>;
-        }
-    }
-    else if(DataType::F16 == dt_input1 && DataType::F16 == dt_input2 && DataType::F16 == dt_output)
-    {
-        _func_float = &mul_F16_F16_F16_n;
-        _func_int   = nullptr;
-    }
-    else if(DataType::F32 == dt_input1 && DataType::F32 == dt_input2 && DataType::F32 == dt_output)
-    {
-        _func_float = &mul_F32_F32_F32_n;
-        _func_int   = nullptr;
-    }
-    else
-    {
-        ARM_COMPUTE_ERROR("You called with the wrong img formats");
+        case DataType::QASYMM8:
+            if(dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8)
+            {
+                _func_quantized = &mul_saturate_quantized_8<uint8_t>;
+            }
+            break;
+        case DataType::QASYMM8_SIGNED:
+            if(dt_input2 == DataType::QASYMM8_SIGNED)
+            {
+                _func_quantized = &mul_saturate_quantized_8<int8_t>;
+                ;
+            }
+            break;
+        case DataType::QSYMM16:
+            if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16)
+            {
+                _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16;
+            }
+            else if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32)
+            {
+                _func_int = &mul_QSYMM16_QSYMM16_S32;
+            }
+            break;
+        case DataType::S16:
+            if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
+            {
+                if(is_scale_255)
+                {
+                    _func_int = is_sat ? &mul_S16_U8_S16<true, true> : &mul_S16_U8_S16<true, false>;
+                }
+                else
+                {
+                    _func_int = is_sat ? &mul_S16_U8_S16<false, true> : &mul_S16_U8_S16<false, false>;
+                }
+            }
+            if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
+            {
+                if(is_scale_255)
+                {
+                    _func_int = is_sat ? &mul_S16_S16_S16<true, true> : &mul_S16_S16_S16<true, false>;
+                }
+                else
+                {
+                    _func_int = is_sat ? &mul_S16_S16_S16<false, true> : &mul_S16_S16_S16<false, false>;
+                }
+            }
+            break;
+        case DataType::U8:
+            if(DataType::U8 == dt_input2 && DataType::U8 == dt_output)
+            {
+                if(is_scale_255)
+                {
+                    _func_int = is_sat ? &mul_U8_U8_U8<true, true> : &mul_U8_U8_U8<true, false>;
+                }
+                else
+                {
+                    _func_int = is_sat ? &mul_U8_U8_U8<false, true> : &mul_U8_U8_U8<false, false>;
+                }
+            }
+            else if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
+            {
+                if(is_scale_255)
+                {
+                    _func_int = is_sat ? &mul_U8_U8_S16<true, true> : &mul_U8_U8_S16<true, false>;
+                }
+                else
+                {
+                    _func_int = is_sat ? &mul_U8_U8_S16<false, true> : &mul_U8_U8_S16<false, false>;
+                }
+            }
+            else if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
+            {
+                if(is_scale_255)
+                {
+                    _func_int = is_sat ? &mul_U8_S16_S16<true, true> : &mul_U8_S16_S16<true, false>;
+                }
+                else
+                {
+                    _func_int = is_sat ? &mul_U8_S16_S16<false, true> : &mul_U8_S16_S16<false, false>;
+                }
+            }
+            break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            _func_float = &mul_F16_F16_F16;
+            break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        case DataType::F32:
+            _func_float = &mul_F32_F32_F32;
+            break;
+        default:
+            ARM_COMPUTE_ERROR("You called with the wrong img formats");
     }
 
-    INEKernel::configure(win_config.second);
+    // Configure kernel window
+    Coordinates coord;
+    coord.set_num_dimensions(output->num_dimensions());
+    output->set_valid_region(valid_region);
+    Window win = calculate_max_window(valid_region, Steps());
+
+    INEKernel::configure(win);
 }
 
 Status NEPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy,
@@ -816,108 +1261,34 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
 
     return Status{};
 }
 
-void NEPixelWiseMultiplicationKernel::run(const Window &window, const ThreadInfo &info)
+void NEPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    const TensorShape &in_shape1 = _input1->info()->tensor_shape();
-    const TensorShape &in_shape2 = _input2->info()->tensor_shape();
-    const TensorShape &out_shape = _output->info()->tensor_shape();
+    auto input1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+    auto input2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+    auto output = tensors.get_tensor(TensorType::ACL_DST);
 
-    bool can_collapse = true;
-    if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+    if(_func_quantized != nullptr)
     {
-        can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
-        for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
-        {
-            can_collapse = (in_shape1[d] == in_shape2[d]);
-        }
-    }
-
-    bool   has_collapsed = false;
-    Window collapsed     = can_collapse ? window.collapse_if_possible(INEKernel::window(), Window::DimZ, &has_collapsed) : window;
-
-    const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
-    const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
-    Window slice        = collapsed.first_slice_window_3D();
-    Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
-    Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
-    Iterator input1(_input1, slice_input1);
-    Iterator input2(_input2, slice_input2);
-    Iterator output(_output, slice);
-
-    if((_run_optimized_qasymm8) || (_func_quantized != nullptr))
-    {
-        if(_run_optimized_qasymm8)
-        {
-            const int32x4_t   input1_voffset = vdupq_n_s32(_input1->info()->quantization_info().uniform().offset);
-            const float32x4_t input1_vscale  = vdupq_n_f32(_input1->info()->quantization_info().uniform().scale);
-            const int32x4_t   input2_voffset = vdupq_n_s32(_input2->info()->quantization_info().uniform().offset);
-            const float32x4_t input2_vscale  = vdupq_n_f32(_input2->info()->quantization_info().uniform().scale);
-            const float32x4_t output_voffset = vdupq_n_f32(static_cast<float>(_output->info()->quantization_info().uniform().offset));
-            const float       output_scale   = _output->info()->quantization_info().uniform().scale;
-            const float32x4_t vinvscale      = vdupq_n_f32(1.f / (output_scale / _scale));
-
-            execute_window_loop(collapsed, [&](const Coordinates &)
-            {
-                mul_saturate_QASYMM8_QASYMM8_QASYMM8_n_opt(input1.ptr(), input2.ptr(), output.ptr(),
-                                                           input1_vscale, input1_voffset, input2_vscale, input2_voffset, output_voffset, vinvscale);
-                ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
-                ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
-            },
-            input1, input2, output);
-        }
-        else
-        {
-            execute_window_loop(collapsed, [&](const Coordinates &)
-            {
-                (*_func_quantized)(input1.ptr(), input2.ptr(), output.ptr(), _scale,
-                                   _input1->info()->quantization_info().uniform(), _input2->info()->quantization_info().uniform(), _output->info()->quantization_info().uniform());
-                ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
-                ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
-            },
-            input1, input2, output);
-        }
+        (*_func_quantized)(input1, input2, output, window, _scale);
     }
     else if(_func_int != nullptr)
     {
-        execute_window_loop(collapsed, [&](const Coordinates &)
-        {
-            (*_func_int)(input1.ptr(), input2.ptr(), output.ptr(), _scale_exponent);
-            ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
-            ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
-        },
-        input1, input2, output);
+        (*_func_int)(input1, input2, output, window, _scale_exponent);
     }
     else
     {
         ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
-        execute_window_loop(collapsed, [&](const Coordinates &)
-        {
-            (*_func_float)(input1.ptr(), input2.ptr(), output.ptr(), _scale);
-            ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
-            ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
-        },
-        input1, input2, output);
+        (*_func_float)(input1, input2, output, window, _scale);
     }
 }
-
-BorderSize NEPixelWiseMultiplicationKernel::border_size() const
-{
-    const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
-    const unsigned int border        = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
-    return BorderSize{ 0, border, 0, 0 };
-}
-
 namespace
 {
 constexpr unsigned int num_elems_processed_per_iteration_complex = 2;
@@ -970,24 +1341,15 @@
 }
 } // namespace
 
-NEComplexPixelWiseMultiplicationKernel::NEComplexPixelWiseMultiplicationKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void NEComplexPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+void NEComplexPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(input1->info(), input2->info(), output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(input1, input2, output));
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window_complex(input1->info(), input2->info(), output->info());
+    auto win_config = validate_and_configure_window_complex(input1, input2, output);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
 
-    _input1 = input1;
-    _input2 = input2;
-    _output = output;
-
     // Create kernel
     INEKernel::configure(win_config.second);
 }
@@ -1001,27 +1363,24 @@
     return Status{};
 }
 
-void NEComplexPixelWiseMultiplicationKernel::run(const Window &window, const ThreadInfo &info)
+void NEComplexPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    Iterator input1(_input1, window.broadcast_if_dimension_le_one(_input1->info()->tensor_shape()));
-    Iterator input2(_input2, window.broadcast_if_dimension_le_one(_input2->info()->tensor_shape()));
-    Iterator output(_output, window);
+    auto input1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+    auto input2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+    auto output = tensors.get_tensor(TensorType::ACL_DST);
+
+    Iterator input1_it(input1, window.broadcast_if_dimension_le_one(input1->info()->tensor_shape()));
+    Iterator input2_it(input2, window.broadcast_if_dimension_le_one(input2->info()->tensor_shape()));
+    Iterator output_it(output, window);
 
     execute_window_loop(window, [&](const Coordinates &)
     {
-        c_mul_F32_F32_F32_n(input1.ptr(), input2.ptr(), output.ptr());
+        c_mul_F32_F32_F32_n(input1_it.ptr(), input2_it.ptr(), output_it.ptr());
     },
-    input1, input2, output);
-}
-
-BorderSize NEComplexPixelWiseMultiplicationKernel::border_size() const
-{
-    const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
-    const unsigned int border        = std::min<unsigned int>(num_elems_processed_per_iteration_complex - 1U, replicateSize);
-    return { 0, border, 0, 0 };
+    input1_it, input2_it, output_it);
 }
 } // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index 6d61f51..efd0aff 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,7 +36,6 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
 #include "support/ToolchainSupport.h"
 
 #include "arm_compute/core/NEON/wrapper/wrapper.h"
@@ -137,6 +136,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
     if(indices)
     {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
     }
@@ -156,7 +156,6 @@
         if(indices)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
-
             ARM_COMPUTE_RETURN_ERROR_ON((indices->dimension(get_data_layout_dimension_index(indices->data_layout(), DataLayoutDimension::WIDTH)) != pooled_w)
                                         || (indices->dimension(get_data_layout_dimension_index(indices->data_layout(), DataLayoutDimension::HEIGHT)) != pooled_h));
         }
@@ -727,6 +726,40 @@
 }
 
 template <typename T>
+inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id, const ITensorInfo &info, int pool_stride_x, int pool_stride_y)
+{
+    const int pad_left    = info.padding().left;
+    const int pad_right   = info.padding().right;
+    const int pad_top     = info.padding().top;
+    const int pad_bottom  = info.padding().bottom;
+    const int in_stride_y = static_cast<int>(info.strides_in_bytes().y());
+    const int in_stride_w = static_cast<int>(info.strides_in_bytes()[3]);
+    const int pad_horiz   = pad_left + pad_right;
+    const int pad_vert    = pad_top + pad_bottom;
+
+    if(info.data_layout() == DataLayout::NCHW)
+    {
+        const uint32_t offset_base = padded_offset
+                                     - sizeof(T) * pad_horiz * id.y() * pool_stride_y                                            /* subtract padding elems per row */
+                                     - pad_top * sizeof(T)                                                                       /* top padding */
+                                     - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() - pad_vert * in_stride_y * id.z() /* for each Z plane there are height*pad_right padding elems */
+                                     - in_stride_w * id[3];
+
+        return offset_base;
+    }
+    else
+    {
+        const uint32_t offset_base = padded_offset
+                                     - sizeof(T) * pad_horiz * id.y() * pool_stride_x                          // subtract padding elems per row
+                                     - pad_top * sizeof(T)                                                     // top padding
+                                     - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() * pool_stride_y // for each Z plane there are width*pad_right padding elems
+                                     - in_stride_w * id[3];
+
+        return offset_base;
+    }
+}
+
+template <typename T>
 void NEPoolingLayerKernel::pooling2_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
 {
     Iterator input(_input, window_input);
@@ -925,63 +958,130 @@
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 }
 
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+template <typename T>
+inline typename std::enable_if<std::is_same<T, float16_t>::value, float32x2_t>::type
+f16_to_f32(float16x4_t input)
+{
+    float32x2_t output = { static_cast<float>(vget_lane_f16(input, 0)), static_cast<float>(vget_lane_f16(input, 1)) };
+    return output;
+}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+template <typename T>
+inline typename std::enable_if<std::is_same<T, float>::value, float32x2_t>::type
+f16_to_f32(float32x2_t input)
+{
+    return input;
+}
+
+template <typename T>
+void NEPoolingLayerKernel::pooling2_nchw_maxpool_indices(const Window &window_input, const Window &window)
+{
+    Iterator  input(_input, window_input);
+    Iterator  output(_output, window);
+    Iterator  indices(_indices, window);
+    const int pool_pad_top  = _pool_info.pad_stride_info.pad_top();
+    const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
+    int       pool_stride_x = 0;
+    int       pool_stride_y = 0;
+    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
+    const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+    const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+    const int            pad_left         = _input->info()->padding().left;
+    const int            pad_right        = _input->info()->padding().right;
+    const int            in_stride_y      = static_cast<int>(_input->info()->strides_in_bytes().y());
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        auto        top_data        = wrapper::vload(reinterpret_cast<const T *>(input_top_ptr + input.offset()));
+        auto        bottom_data     = wrapper::vload(reinterpret_cast<const T *>(input_bottom_ptr + input.offset()));
+        float32x2_t top_data_f32    = f16_to_f32<T>(top_data);
+        float32x2_t bottom_data_f32 = f16_to_f32<T>(bottom_data);
+
+        // Calculate max data, compare top first, then bottom, to make sure the first max is recorded.
+        const float32x2_t max_data_top         = vpmax_f32(top_data_f32, top_data_f32);
+        const float32x2_t max_data_bottom      = vpmax_f32(bottom_data_f32, bottom_data_f32);
+        const float32x2_t max_data             = vmax_f32(max_data_top, max_data_bottom);
+        *(reinterpret_cast<T *>(output.ptr())) = static_cast<T>(vget_lane_f32(max_data, 0));
+
+        // Calculate max data index, which will be used in max unpooling.
+        const uint32_t   offset_base              = offset_no_padding<T>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
+        const uint32_t   offset_top               = (uint32_t)(offset_base / sizeof(T));
+        const uint32_t   offset_bottom            = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left;
+        const uint32x2_t voffset_top              = { offset_top, offset_top + 1u };
+        const uint32x2_t voffset_bottom           = { offset_bottom, offset_bottom + 1u };
+        const uint32x2_t tmp_indices_top          = vbsl_u32(vcge_f32(top_data_f32, vrev64_f32(top_data_f32)), voffset_top, vrev64_u32(voffset_top));
+        const uint32x2_t tmp_indices_bottom       = vbsl_u32(vcge_f32(bottom_data_f32, vrev64_f32(bottom_data_f32)), voffset_bottom, vrev64_u32(voffset_bottom));
+        *(reinterpret_cast<int *>(indices.ptr())) = vget_lane_u32(vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0);
+    },
+    input, output, indices);
+}
+
 void NEPoolingLayerKernel::pooling2_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
 {
     ARM_COMPUTE_UNUSED(pooling_type);
     ARM_COMPUTE_UNUSED(exclude_padding);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    Iterator      input(_input, window_input);
-    Iterator      output(_output, window);
-    constexpr int pool_size       = 2;
-    const int     pool_pad_right  = _pool_info.pad_stride_info.pad_right();
-    const int     pool_pad_top    = _pool_info.pad_stride_info.pad_top();
-    const int     pool_pad_left   = _pool_info.pad_stride_info.pad_left();
-    const int     pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
-    int           pool_stride_x, pool_stride_y = 0;
-    std::tie(pool_stride_x, pool_stride_y)     = _pool_info.pad_stride_info.stride();
-    const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
-    const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
-
-    const unsigned char *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
-    const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
-
-    execute_window_loop(window, [&](const Coordinates & id)
+    if(pooling_type == PoolingType::MAX && _indices)
     {
-        float16x4_t top_data    = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
-        float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
-        float16x4_t res         = {};
+        pooling2_nchw_maxpool_indices<float16_t>(window_input, window);
+    }
+    else
+    {
+        Iterator      input(_input, window_input);
+        Iterator      output(_output, window);
+        constexpr int pool_size       = 2;
+        const int     pool_pad_right  = _pool_info.pad_stride_info.pad_right();
+        const int     pool_pad_top    = _pool_info.pad_stride_info.pad_top();
+        const int     pool_pad_left   = _pool_info.pad_stride_info.pad_left();
+        const int     pool_pad_bottom = _pool_info.pad_stride_info.pad_bottom();
+        int           pool_stride_x, pool_stride_y = 0;
+        std::tie(pool_stride_x, pool_stride_y)     = _pool_info.pad_stride_info.stride();
+        const int upper_bound_w = _input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_right);
+        const int upper_bound_h = _input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_bottom);
 
-        // Get power of 2 in case of l2 pooling
-        if(pooling_type == PoolingType::L2)
+        const unsigned char *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+        const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+
+        execute_window_loop(window, [&](const Coordinates & id)
         {
-            top_data    = vmul_f16(top_data, top_data);
-            bottom_data = vmul_f16(bottom_data, bottom_data);
-        }
+            float16x4_t top_data    = vld1_f16(reinterpret_cast<const float16_t *>(input_top_ptr + input.offset()));
+            float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(input_bottom_ptr + input.offset()));
+            float16x4_t res         = {};
 
-        if(pooling_type != PoolingType::MAX)
-        {
-            const float       scale   = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
-            const float16x4_t scale_v = vdup_n_f16(scale);
+            // Get power of 2 in case of l2 pooling
+            if(pooling_type == PoolingType::L2)
+            {
+                top_data    = vmul_f16(top_data, top_data);
+                bottom_data = vmul_f16(bottom_data, bottom_data);
+            }
 
-            const float16x4_t sum_data = vadd_f16(top_data, bottom_data);
-            res                        = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v);
-        }
-        else
-        {
-            const float16x4_t max_data = vmax_f16(top_data, bottom_data);
-            res                        = vpmax_f16(max_data, max_data);
-        }
+            if(pooling_type != PoolingType::MAX)
+            {
+                const float       scale   = calculate_avg_scale(exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+                const float16x4_t scale_v = vdup_n_f16(scale);
 
-        // Calculate square-root in case of l2 pooling
-        if(pooling_type == PoolingType::L2)
-        {
-            res = vinv_f16(vinvsqrt_f16(res));
-        }
+                const float16x4_t sum_data = vadd_f16(top_data, bottom_data);
+                res                        = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v);
+            }
+            else
+            {
+                const float16x4_t max_data = vmax_f16(top_data, bottom_data);
+                res                        = vpmax_f16(max_data, max_data);
+            }
 
-        // Store result
-        *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
-    },
-    input, output);
+            // Calculate square-root in case of l2 pooling
+            if(pooling_type == PoolingType::L2)
+            {
+                res = vinv_f16(vinvsqrt_f16(res));
+            }
+
+            // Store result
+            *(reinterpret_cast<float16_t *>(output.ptr())) = vget_lane_f16(res, 0);
+        },
+        input, output);
+    }
 #else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
     ARM_COMPUTE_UNUSED(window_input);
     ARM_COMPUTE_UNUSED(window);
@@ -1268,11 +1368,95 @@
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 }
 
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+void NEPoolingLayerKernel::pooling2_f16_nhwc_maxpool_indices(const Window &window_input, const Window &window)
+{
+    Iterator input(_input, window_input);
+    Iterator output(_output, window);
+    Iterator indices(_indices, window);
+
+    const int pool_pad_top  = _pool_info.pad_stride_info.pad_top();
+    const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
+
+    int pool_stride_x = 0;
+    int pool_stride_y = 0;
+    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
+
+    const int pad_right   = _input->info()->padding().right;
+    const int in_stride_y = static_cast<int>(_input->info()->strides_in_bytes().y());
+    const int in_stride_z = static_cast<int>(_input->info()->strides_in_bytes().z());
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        const int idx_width    = id.y() * pool_stride_x;
+        const int idx_height   = id.z() * pool_stride_y;
+        const int pool_limit_y = pool_pad_top - idx_height;
+        const int pool_limit_x = pool_pad_left - idx_width;
+
+        const int pool_start_y = std::max(0, window_input.z().start() + pool_limit_y);
+        const int pool_start_x = std::max(0, window_input.y().start() + pool_limit_x);
+        const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
+                                 (_input->info()->strides_in_bytes().z());
+        const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
+                                 (_input->info()->strides_in_bytes().z());
+
+        const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
+                                 (_input->info()->strides_in_bytes().z());
+
+        const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(_input->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
+                                 (_input->info()->strides_in_bytes().z());
+
+        const auto  in_x0_ptr = reinterpret_cast<const float16_t *>(input.ptr() + in_x0_offset);
+        const auto  in_x1_ptr = reinterpret_cast<const float16_t *>(input.ptr() + in_x1_offset);
+        const auto  in_x2_ptr = reinterpret_cast<const float16_t *>(input.ptr() + in_x2_offset);
+        const auto  in_x3_ptr = reinterpret_cast<const float16_t *>(input.ptr() + in_x3_offset);
+        const auto  v_x0      = vld1q_f16(in_x0_ptr);
+        const auto  v_x1      = vld1q_f16(in_x1_ptr);
+        const auto  v_x2      = vld1q_f16(in_x2_ptr);
+        const auto  v_x3      = vld1q_f16(in_x3_ptr);
+        float16x8_t vres      = vmaxq_f16(vmaxq_f16(v_x2, v_x3), vmaxq_f16(v_x0, v_x1));
+        // Store result
+        vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vres);
+
+        const uint32_t   offset_base    = offset_no_padding<float16_t>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
+        const uint32_t   offset_x0      = (uint32_t)offset_base / sizeof(float16_t);
+        const uint32_t   offset_x1      = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_right;
+        const uint32_t   offset_x2      = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_right * _input->info()->tensor_shape()[1];
+        const uint32_t   offset_x3      = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_right;
+        const uint32x4_t voffset_x0_0   = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 };
+        const uint32x4_t voffset_x0_1   = { offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7 };
+        const uint16x8_t voffset_x0     = vcombine_u16(vmovn_u32(voffset_x0_0), vmovn_u32(voffset_x0_1));
+        const uint32x4_t voffset_x1_0   = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 };
+        const uint32x4_t voffset_x1_1   = { offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7 };
+        const uint16x8_t voffset_x1     = vcombine_u16(vmovn_u32(voffset_x1_0), vmovn_u32(voffset_x1_1));
+        const uint32x4_t voffset_x2_0   = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 };
+        const uint32x4_t voffset_x2_1   = { offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7 };
+        const uint16x8_t voffset_x2     = vcombine_u16(vmovn_u32(voffset_x2_0), vmovn_u32(voffset_x2_1));
+        const uint32x4_t voffset_x3_0   = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 };
+        const uint32x4_t voffset_x3_1   = { offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7 };
+        const uint16x8_t voffset_x3     = vcombine_u16(vmovn_u32(voffset_x3_0), vmovn_u32(voffset_x3_1));
+        const uint16x8_t tmp_indices0   = vbslq_u16(vcgeq_f16(v_x0, v_x1), voffset_x0, voffset_x1);
+        const uint16x8_t tmp_indices1   = vbslq_u16(vcgeq_f16(v_x2, v_x3), voffset_x2, voffset_x3);
+        const uint16x8_t tmp_indices2   = vbslq_u16(vcgeq_f16(vmaxq_f16(v_x0, v_x1), vmaxq_f16(v_x2, v_x3)), tmp_indices0, tmp_indices1);
+        const uint32x4_t tmp_indeces3_0 = vmovl_u16(vget_low_u16(tmp_indices2));
+        const uint32x4_t tmp_indeces3_1 = vmovl_u16(vget_high_u16(tmp_indices2));
+        // Store indices
+        vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()), tmp_indeces3_0);
+        vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr() + 16), tmp_indeces3_1);
+    },
+    input, output, indices);
+}
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
 void NEPoolingLayerKernel::poolingMxN_f16_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding)
 {
     ARM_COMPUTE_UNUSED(pooling_type);
     ARM_COMPUTE_UNUSED(exclude_padding);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    if(_pool_info.pool_size == Size2D(2, 2) && pooling_type == PoolingType::MAX && _indices)
+    {
+        pooling2_f16_nhwc_maxpool_indices(window_input, window);
+    }
     Iterator input(_input, window_input);
     Iterator output(_output, window);
 
@@ -1489,55 +1673,12 @@
     input, output);
 }
 
-void NEPoolingLayerKernel::pooling2_f32_nchw_maxpool_indices(const Window &window_input, const Window &window)
-{
-    Iterator  input(_input, window_input);
-    Iterator  output(_output, window);
-    Iterator  indices(_indices, window);
-    int       final_index   = 0;
-    const int pool_pad_top  = _pool_info.pad_stride_info.pad_top();
-    const int pool_pad_left = _pool_info.pad_stride_info.pad_left();
-    int       pool_stride_x = 0;
-    int       pool_stride_y = 0;
-    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
-    const uint8_t *const input_top_ptr    = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
-    const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
-
-    const Strides &input_strides = _input->info()->strides_in_bytes();
-    const auto     in_stridew    = input_strides[1];
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const auto        input_offset_top    = input_top_ptr + input.offset();
-        const auto        input_offset_bottom = input_bottom_ptr + input.offset();
-        const auto        in_top_ptr          = reinterpret_cast<const float *>(input_offset_top);
-        const auto        in_bottom_ptr       = reinterpret_cast<const float *>(input_offset_bottom);
-        float32x2_t       top_data            = vld1_f32(in_top_ptr);
-        float32x2_t       bottom_data         = vld1_f32(in_bottom_ptr);
-        float32x2_t       res                 = {};
-        float             final_res           = 0;
-        const float32x2_t max_data            = vmax_f32(top_data, bottom_data);
-        res                                   = vpmax_f32(max_data, max_data);
-        final_res                             = vget_lane_f32(res, 0);
-        // Store result
-        *(reinterpret_cast<float *>(output.ptr())) = final_res;
-        const uint32_t   offset_top                = (uint32_t)(input.offset() / sizeof(float));
-        const uint32_t   offset_bottom             = (uint32_t)offset_top + (in_stridew / sizeof(float));
-        const uint32x2_t voffset_top               = { offset_top, offset_top + 1u };
-        const uint32x2_t voffset_bottom            = { offset_bottom, offset_bottom + 1u };
-        const uint32x2_t tmp_indices               = vbsl_u32(vcgt_f32(top_data, bottom_data), voffset_top, voffset_bottom);
-        final_index                                = vget_lane_u32(vbsl_u32(vcgt_f32(max_data, vrev64_f32(max_data)), tmp_indices, vrev64_u32(tmp_indices)), 0);
-        *(reinterpret_cast<int *>(indices.ptr()))  = final_index;
-    },
-    input, output, indices);
-}
-
 void NEPoolingLayerKernel::pooling2_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type,
                                              bool exclude_padding)
 {
     if(pooling_type == PoolingType::MAX && _indices)
     {
-        pooling2_f32_nchw_maxpool_indices(window_input, window);
+        pooling2_nchw_maxpool_indices<float>(window_input, window);
     }
     else
     {
@@ -1867,10 +2008,8 @@
     float32x4_t vres;
 
     const int pad_right   = _input->info()->padding().right;
-    const int pad_top     = _input->info()->padding().top;
     const int in_stride_y = static_cast<int>(_input->info()->strides_in_bytes().y());
     const int in_stride_z = static_cast<int>(_input->info()->strides_in_bytes().z());
-    const int in_stride_w = static_cast<int>(_input->info()->strides_in_bytes()[3]);
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
@@ -1904,27 +2043,20 @@
         // Store result
         vst1q_f32(reinterpret_cast<float *>(output.ptr()), vres);
 
-        const uint32_t offset_base = input.offset()
-                                     - sizeof(float) * pad_right * id.y() * pool_stride_x                                     /* subtract padding elems per row */
-                                     - pad_top * sizeof(float)                                                                /* top padding */
-                                     - sizeof(float) * pad_right * _input->info()->tensor_shape()[1] * id.z() * pool_stride_y /* for each Z plane there are width*pad_right padding elems */
-                                     - in_stride_w * id[3] + _input->info()->tensor_shape()[0] * sizeof(float) * id[3];
-
-        const uint32_t offset_x0 = (uint32_t)offset_base / sizeof(float);
-        const uint32_t offset_x1 = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_right;
-        const uint32_t offset_x2 = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_right * _input->info()->tensor_shape()[1];
-        const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_right;
-
+        const uint32_t   offset_base  = offset_no_padding<float>(input.offset(), id, *_input->info(), pool_stride_x, pool_stride_y);
+        const uint32_t   offset_x0    = (uint32_t)offset_base / sizeof(float);
+        const uint32_t   offset_x1    = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_right;
+        const uint32_t   offset_x2    = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_right * _input->info()->tensor_shape()[1];
+        const uint32_t   offset_x3    = (uint32_t)offset_x2 + in_stride_y / sizeof(float) - pad_right;
         const uint32x4_t voffset_x0   = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 };
         const uint32x4_t voffset_x1   = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 };
         const uint32x4_t voffset_x2   = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 };
         const uint32x4_t voffset_x3   = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 };
-        const uint32x4_t tmp_indices0 = vbslq_u32(vcgtq_f32(v_x0, v_x1), voffset_x0, voffset_x1);
-        const uint32x4_t tmp_indices1 = vbslq_u32(vcgtq_f32(v_x2, v_x3), voffset_x2, voffset_x3);
-        const uint32x4_t tmp_indices2 = vbslq_u32(vcgtq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1);
-
+        const uint32x4_t tmp_indices0 = vbslq_u32(vcgeq_f32(v_x0, v_x1), voffset_x0, voffset_x1);
+        const uint32x4_t tmp_indices1 = vbslq_u32(vcgeq_f32(v_x2, v_x3), voffset_x2, voffset_x3);
+        const uint32x4_t tmp_indices2 = vbslq_u32(vcgeq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1);
+        // Store indices
         vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()), tmp_indices2);
-
     },
     input, output, indices);
 }
diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
index d830d0d..808b68a 100644
--- a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,7 +29,6 @@
 #include "arm_compute/core/Validate.h"
 
 #include <arm_neon.h>
-#include <cstdint>
 
 namespace arm_compute
 {
@@ -68,6 +67,7 @@
     if(output != nullptr && output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != 2);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
     }
 
     return Status{};
diff --git a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
index 29ffee8..cbfbda7 100644
--- a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
index 113abad..26ba401 100644
--- a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
index 3b944ab..955cdc2 100644
--- a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -72,34 +72,6 @@
 
     return Status{};
 }
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Output auto inizialitation if not yet initialized
-    const TensorShape output_shape = compute_roi_align_shape(*input, *rois, pool_info);
-    auto_init_if_empty((*output), output_shape, 1, input->data_type());
-    output->set_data_layout(input->data_layout());
-
-    const unsigned int num_rois = rois->dimension(1);
-    Window             window;
-    window.set(Window::DimX, Window::Dimension(0, num_rois));
-    window.set(Window::DimY, Window::Dimension(0, 1));
-
-    AccessWindowStatic input_access(input,
-                                    input->valid_region().start(0),
-                                    input->valid_region().start(1),
-                                    input->valid_region().end(0),
-                                    input->valid_region().end(1));
-    AccessWindowStatic output_access(output, 0, 0, pool_info.pooled_width(), pool_info.pooled_height());
-
-    const bool window_changed = update_window_and_padding(window, input_access, output_access);
-    output_access.set_valid_region(window, ValidRegion(Coordinates(), output->tensor_shape()));
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, window);
-}
 } // namespace
 
 NEROIAlignLayerKernel::NEROIAlignLayerKernel()
@@ -111,9 +83,20 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info));
+    // Output auto inizialitation if not yet initialized
+    const TensorShape output_shape = compute_roi_align_shape(*input->info(), *rois->info(), pool_info);
+    auto_init_if_empty((*output->info()), output_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+    output->info()->set_data_layout(input->info()->data_layout());
+
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), rois->info(), output->info(), pool_info);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    const unsigned int num_rois = rois->info()->dimension(1);
+    Window             window;
+    window.set(Window::DimX, Window::Dimension(0, num_rois));
+    window.set(Window::DimY, Window::Dimension(0, 1));
+
+    Coordinates coord;
+    coord.set_num_dimensions(output->info()->num_dimensions());
+    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
 
     // Set instance variables
     _input     = input;
@@ -121,7 +104,7 @@
     _output    = output;
     _pool_info = pool_info;
 
-    INEKernel::configure(win_config.second);
+    INEKernel::configure(window);
 }
 
 Status NEROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
index 708420c..6a960c7 100644
--- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,18 +23,13 @@
  */
 #include "arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "support/ToolchainSupport.h"
 
 #include <cfloat>
-#include <cmath>
 
 namespace arm_compute
 {
@@ -53,7 +48,7 @@
     ARM_COMPUTE_ERROR_ON(rois->info()->dimension(0) != 5);
     ARM_COMPUTE_ERROR_ON(rois->info()->num_dimensions() > 2);
     ARM_COMPUTE_ERROR_ON_CPU_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
     ARM_COMPUTE_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
 
     if(output->info()->total_size() != 0)
@@ -82,15 +77,10 @@
     window.set(Window::DimX, Window::Dimension(0, rois->info()->dimension(1)));
     window.set(Window::DimY, Window::Dimension(0, 1));
 
-    AccessWindowStatic input_access(input->info(),
-                                    input->info()->valid_region().start(0),
-                                    input->info()->valid_region().start(1),
-                                    input->info()->valid_region().end(0),
-                                    input->info()->valid_region().end(1));
-    AccessWindowStatic output_access(output->info(), 0, 0, pool_info.pooled_width(), pool_info.pooled_height());
+    Coordinates coord;
+    coord.set_num_dimensions(output->info()->num_dimensions());
+    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
 
-    ARM_COMPUTE_UNUSED(update_window_and_padding(window, input_access, output_access));
-    output_access.set_valid_region(window, ValidRegion(Coordinates(), output->info()->tensor_shape()));
     INEKernel::configure(window);
 }
 
diff --git a/src/core/NEON/kernels/NERangeKernel.cpp b/src/core/NEON/kernels/NERangeKernel.cpp
index fedb923..c8a456a 100644
--- a/src/core/NEON/kernels/NERangeKernel.cpp
+++ b/src/core/NEON/kernels/NERangeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index afe58ed..5a52216 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NERemapKernel.cpp b/src/core/NEON/kernels/NERemapKernel.cpp
index 3c871de..2881161 100644
--- a/src/core/NEON/kernels/NERemapKernel.cpp
+++ b/src/core/NEON/kernels/NERemapKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEReorgLayerKernel.cpp b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
index f8a8732..317bc25 100644
--- a/src/core/NEON/kernels/NEReorgLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
index 53fcfd7..23b349b 100644
--- a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,13 +31,14 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 
 #include <cstdint>
 
 /** [NEReshapeLayerKernel Kernel] **/
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
@@ -71,56 +72,57 @@
 }
 } // namespace
 
-void NEReshapeLayerKernel::configure(const ITensor *input, ITensor *output)
+void NEReshapeLayerKernel::configure(const ITensorInfo *input, ITensorInfo *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
-    _input  = input;
-    _output = output;
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output));
 
     // Configure kernel window
-    Window win = calculate_max_window(*input->info());
+    Window win = calculate_max_window(*input);
 
     // Set the output valid region
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
 
     INEKernel::configure(win);
 }
 
+void NEReshapeLayerKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+    auto       dst = tensors.get_tensor(TensorType::ACL_DST);
+
+    switch(src->info()->data_type())
+    {
+        case DataType::U8:
+        case DataType::S8:
+        case DataType::QASYMM8:
+        case DataType::QASYMM8_SIGNED:
+            reshape_tensor<uint8_t>(window, src, dst);
+            break;
+        case DataType::U16:
+        case DataType::S16:
+        case DataType::F16:
+            reshape_tensor<uint16_t>(window, src, dst);
+            break;
+        case DataType::U32:
+        case DataType::S32:
+        case DataType::F32:
+            reshape_tensor<uint32_t>(window, src, dst);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported data type!");
+    }
+}
+
 Status NEReshapeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
 
     return Status{};
 }
-
-void NEReshapeLayerKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
-
-    switch(_input->info()->data_type())
-    {
-        case DataType::U8:
-        case DataType::S8:
-        case DataType::QASYMM8:
-        case DataType::QASYMM8_SIGNED:
-            reshape_tensor<uint8_t>(window, _input, _output);
-            break;
-        case DataType::U16:
-        case DataType::S16:
-        case DataType::F16:
-            reshape_tensor<uint16_t>(window, _input, _output);
-            break;
-        case DataType::U32:
-        case DataType::S32:
-        case DataType::F32:
-            reshape_tensor<uint32_t>(window, _input, _output);
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Unsupported data type!");
-    }
-}
+} // namespace arm_compute
 /** [NEReshapeLayerKernel Kernel] **/
diff --git a/src/core/NEON/kernels/NEReverseKernel.cpp b/src/core/NEON/kernels/NEReverseKernel.cpp
index 5a8c446..2e6135b 100644
--- a/src/core/NEON/kernels/NEReverseKernel.cpp
+++ b/src/core/NEON/kernels/NEReverseKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,24 +23,11 @@
  */
 #include "arm_compute/core/NEON/kernels/NEReverseKernel.h"
 
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/NEON/NEMath.h"
 #include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 
-#include <arm_neon.h>
-#include <array>
-#include <cmath>
-#include <map>
-
 namespace arm_compute
 {
 namespace
@@ -48,7 +35,7 @@
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, axis);
-    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+    //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->num_dimensions() > 1, "Axis must be a 1D tensor");
@@ -159,28 +146,19 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    switch(_input->info()->data_type())
+    switch(_input->info()->element_size())
     {
-        case DataType::F32:
-        case DataType::U32:
-        case DataType::S32:
+        case 4:
             run_reverse<uint32_t>(window, _input, _axis, _output);
             break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::F16:
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::S16:
-        case DataType::U16:
+        case 2:
             run_reverse<uint16_t>(window, _input, _axis, _output);
             break;
-        case DataType::QASYMM8:
-        case DataType::QASYMM8_SIGNED:
-        case DataType::U8:
-        case DataType::S8:
+        case 1:
             run_reverse<uint8_t>(window, _input, _axis, _output);
             break;
         default:
-            ARM_COMPUTE_ERROR("Data type not supported");
+            ARM_COMPUTE_ERROR("Element size not supported");
     }
 }
 } // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp
index 4f2f925..94fcfe2 100644
--- a/src/core/NEON/kernels/NEScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEScaleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,8 +28,11 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/Rounding.h"
 #include "arm_compute/core/utils/misc/Utility.h"
 
+#include "src/core/utils/ScaleUtils.h"
+
 #include <arm_neon.h>
 
 namespace arm_compute
@@ -37,17 +40,16 @@
 namespace
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy,
-                          const ITensorInfo *offsets, ITensorInfo *output, InterpolationPolicy policy,
-                          BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding, bool align_corners)
+                          const ITensorInfo *offsets, ITensorInfo *output, const ScaleKernelInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(output == input);
-    ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER && sampling_policy != SamplingPolicy::TOP_LEFT);
-    ARM_COMPUTE_RETURN_ERROR_ON(!use_padding && border_mode != BorderMode::CONSTANT);
-    ARM_COMPUTE_UNUSED(constant_border_value);
+    ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT);
+    ARM_COMPUTE_RETURN_ERROR_ON(!info.use_padding && info.border_mode != BorderMode::CONSTANT);
+    ARM_COMPUTE_UNUSED(info.constant_border_value);
 
     const DataLayout data_layout   = input->data_layout();
     const auto       width_index   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -57,31 +59,21 @@
     ARM_COMPUTE_RETURN_ERROR_ON(output_width == 0);
     ARM_COMPUTE_RETURN_ERROR_ON(output_height == 0);
 
-    if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+    if(info.interpolation_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
     }
 
-    if(policy == InterpolationPolicy::BILINEAR)
+    if(info.interpolation_policy == InterpolationPolicy::BILINEAR)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dx, 1, DataType::F32);
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dy, 1, DataType::F32);
-
-        if(align_corners)
-        {
-            // For bilinear method with aligned corners, the resize ratio will
-            // be calculated by (input_size - 1)/(output_size - 1). Belows are
-            // checking possible overflows.
-            const auto input_width  = input->dimension(width_index);
-            const auto input_height = input->dimension(height_index);
-
-            ARM_COMPUTE_RETURN_ERROR_ON(input_width == 0 || input_height == 0);
-            ARM_COMPUTE_RETURN_ERROR_ON((output_width - 1 == 0) || (output_height - 1 == 0));
-        }
     }
 
-    if(policy == InterpolationPolicy::AREA)
+    ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy));
+
+    if(info.interpolation_policy == InterpolationPolicy::AREA)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW);
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
@@ -91,7 +83,7 @@
 }
 
 std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input, ITensorInfo *dx, ITensorInfo *dy, ITensorInfo *offsets, ITensorInfo *output,
-                                                             InterpolationPolicy policy, bool border_undefined, SamplingPolicy sampling_policy, BorderSize border_size)
+                                                             const ScaleKernelInfo &info, BorderSize border_size)
 {
     bool   window_changed{ false };
     Window win{};
@@ -123,30 +115,28 @@
     AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
     window_changed = window_changed || update_window_and_padding(win, input_access, output_access);
     output_access.set_valid_region(win, calculate_valid_region_scale(*input, output->tensor_shape(),
-                                                                     policy, sampling_policy, border_undefined));
+                                                                     info.interpolation_policy, info.sampling_policy, info.border_mode == BorderMode::UNDEFINED));
 
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
 }
 
-std::pair<Status, Window> validate_and_configure_window_nhwc(ITensorInfo *input, ITensorInfo *output,
-                                                             InterpolationPolicy policy, bool border_undefined,
-                                                             SamplingPolicy sampling_policy, BorderSize border_size, bool use_padding)
+std::pair<Status, Window> validate_and_configure_window_nhwc(ITensorInfo *input, ITensorInfo *output, const ScaleKernelInfo &info, BorderSize border_size)
 {
     bool   window_changed{ false };
     Window win{};
 
-    const unsigned int num_elems_processed_per_iteration = (use_padding && policy == InterpolationPolicy::NEAREST_NEIGHBOR) ? 16 / input->element_size() : 1;
+    const unsigned int num_elems_processed_per_iteration = (info.use_padding && info.interpolation_policy == InterpolationPolicy::NEAREST_NEIGHBOR) ? 16 / input->element_size() : 1;
 
     // Configure kernel window
     win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
 
-    if(use_padding)
+    if(info.use_padding)
     {
         AccessWindowStatic     input_access(input, 0, -border_size.top, ceil_to_multiple(input->tensor_shape()[0], num_elems_processed_per_iteration), input->tensor_shape()[1]);
         AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
         window_changed = update_window_and_padding(win, input_access, output_access);
-        output->set_valid_region(calculate_valid_region_scale(*input, output->tensor_shape(), policy, sampling_policy, border_undefined));
+        output->set_valid_region(calculate_valid_region_scale(*input, output->tensor_shape(), info.interpolation_policy, info.sampling_policy, info.border_mode == BorderMode::UNDEFINED));
     }
 
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
@@ -154,20 +144,20 @@
 }
 
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *dx, ITensorInfo *dy, ITensorInfo *offsets, ITensorInfo *output,
-                                                        InterpolationPolicy policy, bool border_undefined, SamplingPolicy sampling_policy, BorderSize border_size, bool use_padding)
+                                                        const ScaleKernelInfo &info, BorderSize border_size)
 {
     std::pair<Status, Window> win_config;
     switch(input->data_layout())
     {
         case DataLayout::NCHW:
-            if(!use_padding)
+            if(!info.use_padding)
             {
                 return std::make_pair(ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Padding required for NCHW"), Window{});
             }
-            win_config = validate_and_configure_window_nchw(input, dx, dy, offsets, output, policy, border_undefined, sampling_policy, border_size);
+            win_config = validate_and_configure_window_nchw(input, dx, dy, offsets, output, info, border_size);
             break;
         case DataLayout::NHWC:
-            win_config = validate_and_configure_window_nhwc(input, output, policy, border_undefined, sampling_policy, border_size, use_padding);
+            win_config = validate_and_configure_window_nhwc(input, output, info, border_size);
             break;
         default:
             win_config = std::make_pair(ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported data layout!"), Window{});
@@ -178,7 +168,7 @@
 
 template <typename T>
 inline void scale_nearest_nhwc_core(const ITensor *input, const ITensor *offsets, ITensor *output,
-                                    float hr, Window window, const Window &win_in, size_t stride_w, size_t stride_h, size_t stride_c, float sampling_offset)
+                                    float hr, Window window, const Window &win_in, size_t stride_w, size_t stride_h, size_t stride_c, float sampling_offset, bool align_corners)
 {
     const int  window_step_x  = 16 / sizeof(T);
     const auto window_start_x = static_cast<int32_t>(window.x().start());
@@ -194,7 +184,7 @@
     execute_window_loop(window, [&](const Coordinates & id)
     {
         const int32_t offset     = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
-        const int     in_yi      = std::floor((id.z() + sampling_offset) * hr);
+        const auto    in_yi      = static_cast<int>(align_corners ? arm_compute::utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
         const int     offset_row = in_yi * stride_h;
         int32_t       x          = window_start_x;
         for(; x < window_end_x - window_step_x; x += window_step_x)
@@ -353,8 +343,7 @@
 }
 
 void NEScaleKernel::configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets,
-                              ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy,
-                              bool use_padding, bool align_corners)
+                              ITensor *output, const ScaleKernelInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     // Perform validation step
@@ -363,7 +352,7 @@
                                                   dy != nullptr ? dy->info() : nullptr,
                                                   offsets != nullptr ? offsets->info() : nullptr,
                                                   output->info(),
-                                                  policy, border_mode, constant_border_value, sampling_policy, use_padding, align_corners));
+                                                  info));
 
     // Get data layout and width/height indices
     const DataLayout data_layout = input->info()->data_layout();
@@ -375,38 +364,33 @@
     _offsets               = offsets;
     _dx                    = dx;
     _dy                    = dy;
-    _policy                = policy;
+    _policy                = info.interpolation_policy;
     _border_size           = BorderSize(1);
-    _border_mode           = border_mode;
-    _constant_border_value = constant_border_value;
-    _use_padding           = use_padding;
-    _align_corners         = _policy == InterpolationPolicy::BILINEAR
-                             && sampling_policy == SamplingPolicy::TOP_LEFT
-                             && align_corners;
+    _border_mode           = info.border_mode;
+    _constant_border_value = info.constant_border_value;
+    _use_padding           = info.use_padding;
+    _align_corners         = info.align_corners;
 
-    if(sampling_policy == SamplingPolicy::CENTER)
+    if(info.sampling_policy == SamplingPolicy::CENTER)
     {
         _sampling_offset = 0.5f;
     }
 
     // Compute the ratio between source width/height and destination width/height
-    const auto wr = arm_compute::calculate_resize_ratio(input->info()->dimension(idx_width), output->info()->dimension(idx_width), _align_corners);
-    const auto hr = arm_compute::calculate_resize_ratio(input->info()->dimension(idx_height), output->info()->dimension(idx_height), _align_corners);
+    const auto wr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_width), output->info()->dimension(idx_width), _align_corners);
+    const auto hr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_height), output->info()->dimension(idx_height), _align_corners);
 
     // Add constant border only on top in case of NHWC layout
     if(data_layout == DataLayout::NHWC)
     {
-        _border_size = (border_mode != BorderMode::REPLICATE && policy == InterpolationPolicy::BILINEAR && use_padding) ? BorderSize(1, 0, 0, 0) : BorderSize(0);
+        _border_size = (info.border_mode != BorderMode::REPLICATE && info.interpolation_policy == InterpolationPolicy::BILINEAR && info.use_padding) ? BorderSize(1, 0, 0, 0) : BorderSize(0);
     }
 
     // Area interpolation behaves as Nearest Neighbour in case of up-sampling
-    if(policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
-    {
-        policy = InterpolationPolicy::NEAREST_NEIGHBOR;
-    }
+    const auto policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : _policy;
 
     // Select interpolation function
-    switch(policy)
+    switch(policy_to_use)
     {
         case InterpolationPolicy::NEAREST_NEIGHBOR:
         {
@@ -433,7 +417,7 @@
                                                                          dy != nullptr ? dy->info() : nullptr,
                                                                          offsets != nullptr ? offsets->info() : nullptr,
                                                                          output->info(),
-                                                                         policy, border_mode == BorderMode::UNDEFINED, sampling_policy, border_size(), use_padding);
+                                                                         info, border_size());
 
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     INEKernel::configure(win_config.second);
@@ -444,7 +428,7 @@
     const size_t input_stride = _input->info()->strides_in_bytes()[1];
 
     // Compute the ratio between source height and destination height
-    const auto hr = arm_compute::calculate_resize_ratio(_input->info()->dimension(1), _output->info()->dimension(1), _align_corners);
+    const auto hr = arm_compute::scale_utils::calculate_resize_ratio(_input->info()->dimension(1), _output->info()->dimension(1), _align_corners);
 
     // Don't increment in X and Y direction for the input tensor
     // A pointer to the start of this plane is needed as base for the precomputed offsets
@@ -477,8 +461,8 @@
                 const auto           offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
                 const uint8_t *const in_ptr      = in.ptr();
 
-                const int in_yi         = std::floor((id.y() + _sampling_offset) * hr);
-                const int in_yi_clamped = std::min(static_cast<int>(_input->info()->dimension(1)), std::max(in_yi, -1));
+                const auto in_yi         = static_cast<int>(_align_corners ? arm_compute::utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((id.y() + _sampling_offset) * hr));
+                const int  in_yi_clamped = std::min(static_cast<int>(_input->info()->dimension(1)), std::max(in_yi, -1));
                 ARM_COMPUTE_ERROR_ON(in_yi_clamped < -1 || in_yi_clamped > static_cast<int>(_input->info()->dimension(1)));
                 const int offset_row = in_yi_clamped * input_stride;
 
@@ -514,8 +498,8 @@
                 const auto           offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
                 const uint8_t *const in_ptr      = in.ptr();
 
-                const int in_yi         = std::floor((id.y() + _sampling_offset) * hr);
-                const int in_yi_clamped = std::min(static_cast<int>(_input->info()->dimension(1)), std::max(in_yi, -1));
+                const auto in_yi         = static_cast<int>(_align_corners ? arm_compute::utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((id.y() + _sampling_offset) * hr));
+                const int  in_yi_clamped = std::min(static_cast<int>(_input->info()->dimension(1)), std::max(in_yi, -1));
                 ARM_COMPUTE_ERROR_ON(in_yi_clamped < -1 || in_yi_clamped > static_cast<int>(_input->info()->dimension(1)));
                 const int offset_row = in_yi_clamped * input_stride;
 
@@ -554,9 +538,8 @@
             execute_window_loop(window, [&](const Coordinates & id)
             {
                 const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
-
-                const int in_yi      = std::floor((id.y() + _sampling_offset) * hr);
-                const int offset_row = in_yi * input_stride;
+                const auto in_yi       = static_cast<int>(_align_corners ? arm_compute::utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((id.y() + _sampling_offset) * hr));
+                const int  offset_row  = in_yi * input_stride;
 
                 tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0);
                 tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[0], 1);
@@ -595,9 +578,8 @@
             execute_window_loop(window, [&](const Coordinates & id)
             {
                 const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
-
-                const int in_yi      = std::floor((id.y() + _sampling_offset) * hr);
-                const int offset_row = in_yi * input_stride;
+                const auto in_yi       = static_cast<int>(_align_corners ? arm_compute::utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((id.y() + _sampling_offset) * hr));
+                const int  offset_row  = in_yi * input_stride;
 
                 tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0);
                 tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[0], 1);
@@ -638,9 +620,8 @@
             execute_window_loop(window, [&](const Coordinates & id)
             {
                 const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
-
-                const int in_yi      = std::floor((id.y() + _sampling_offset) * hr);
-                const int offset_row = in_yi * input_stride;
+                const auto in_yi       = static_cast<int>(_align_corners ? arm_compute::utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((id.y() + _sampling_offset) * hr));
+                const int  offset_row  = in_yi * input_stride;
 
                 tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0);
                 tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[4] + offset_row), tmp.val[0], 1);
@@ -677,7 +658,7 @@
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::F32);
 
     // Compute the ratio between source height and destination height
-    const auto hr = arm_compute::calculate_resize_ratio(_input->info()->dimension(1), _output->info()->dimension(1), _align_corners);
+    const auto hr = arm_compute::scale_utils::calculate_resize_ratio(_input->info()->dimension(1), _output->info()->dimension(1), _align_corners);
 
     // Don't increment in X and Y direction for the input tensor
     // A pointer to the start of this plane is needed as base for the precomputed offsets
@@ -978,8 +959,8 @@
     Iterator in(_input, win_in);
     Iterator out(_output, window);
 
-    const auto   wr        = arm_compute::calculate_resize_ratio(_input->info()->dimension(0), _output->info()->dimension(0), _align_corners);
-    const auto   hr        = arm_compute::calculate_resize_ratio(_input->info()->dimension(1), _output->info()->dimension(1), _align_corners);
+    const auto   wr        = arm_compute::scale_utils::calculate_resize_ratio(_input->info()->dimension(0), _output->info()->dimension(0), _align_corners);
+    const auto   hr        = arm_compute::scale_utils::calculate_resize_ratio(_input->info()->dimension(1), _output->info()->dimension(1), _align_corners);
     const auto   w         = _input->info()->dimension(0);
     const auto   h         = _input->info()->dimension(1);
     const size_t in_stride = _input->info()->strides_in_bytes()[1];
@@ -1026,7 +1007,7 @@
     const size_t input_stride_c = _input->info()->strides_in_bytes()[idx_channels];
 
     // Compute the ratio between source height and destination height
-    const auto hr = arm_compute::calculate_resize_ratio(_input->info()->dimension(idx_height), _output->info()->dimension(idx_height), _align_corners);
+    const auto hr = arm_compute::scale_utils::calculate_resize_ratio(_input->info()->dimension(idx_height), _output->info()->dimension(idx_height), _align_corners);
 
     // Don't increment in width/height/channels for the input tensor
     // A pointer to the start of this plane is needed as base for the precomputed offsets
@@ -1041,7 +1022,7 @@
         {
             if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
             {
-                scale_nearest_nhwc_core<int8_t>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset);
+                scale_nearest_nhwc_core<int8_t>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset, _align_corners);
             }
             else
             {
@@ -1055,7 +1036,7 @@
         {
             if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
             {
-                scale_nearest_nhwc_core<uint8_t>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset);
+                scale_nearest_nhwc_core<uint8_t>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset, _align_corners);
             }
             else
             {
@@ -1068,7 +1049,7 @@
         {
             if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
             {
-                scale_nearest_nhwc_core<int16_t>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset);
+                scale_nearest_nhwc_core<int16_t>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset, _align_corners);
             }
             else
             {
@@ -1083,7 +1064,7 @@
             if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
             {
                 scale_nearest_nhwc_core<float16_t>(_input, _offsets, _output, hr,
-                                                   window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset);
+                                                   window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset, _align_corners);
             }
             else
             {
@@ -1097,7 +1078,7 @@
         {
             if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
             {
-                scale_nearest_nhwc_core<float>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset);
+                scale_nearest_nhwc_core<float>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset, _align_corners);
             }
             else
             {
@@ -1113,22 +1094,21 @@
 }
 
 Status NEScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy,
-                               const ITensorInfo *offsets, ITensorInfo *output, InterpolationPolicy policy,
-                               BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding, bool align_corners)
+                               const ITensorInfo *offsets, ITensorInfo *output, const ScaleKernelInfo &info)
 {
     BorderSize border_size(1);
     if(input->data_layout() == DataLayout::NHWC)
     {
-        border_size = (border_mode == BorderMode::CONSTANT && policy == InterpolationPolicy::BILINEAR) ? BorderSize(1, 0, 0, 0) : BorderSize(0);
+        border_size = (info.border_mode == BorderMode::CONSTANT && info.interpolation_policy == InterpolationPolicy::BILINEAR) ? BorderSize(1, 0, 0, 0) : BorderSize(0);
     }
 
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, dx, dy, offsets, output, policy, border_mode, constant_border_value, sampling_policy, use_padding, align_corners));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, dx, dy, offsets, output, info));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
                                                               dx != nullptr ? dx->clone().get() : nullptr,
                                                               dy != nullptr ? dy->clone().get() : nullptr,
                                                               offsets != nullptr ? offsets->clone().get() : nullptr,
                                                               output->clone().get(),
-                                                              policy, border_mode == BorderMode::UNDEFINED, sampling_policy, border_size, use_padding)
+                                                              info, border_size)
                                 .first);
 
     return Status{};
diff --git a/src/core/NEON/kernels/NEScharr3x3Kernel.cpp b/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
index 3add699..dcc9362 100644
--- a/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NESelectKernel.cpp b/src/core/NEON/kernels/NESelectKernel.cpp
index ed90de8..86e8233 100644
--- a/src/core/NEON/kernels/NESelectKernel.cpp
+++ b/src/core/NEON/kernels/NESelectKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NESobel3x3Kernel.cpp b/src/core/NEON/kernels/NESobel3x3Kernel.cpp
index 7a27203..eb9d3c3 100644
--- a/src/core/NEON/kernels/NESobel3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NESobel3x3Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NESobel5x5Kernel.cpp b/src/core/NEON/kernels/NESobel5x5Kernel.cpp
index a92cfc2..fc8ccc8 100644
--- a/src/core/NEON/kernels/NESobel5x5Kernel.cpp
+++ b/src/core/NEON/kernels/NESobel5x5Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NESobel7x7Kernel.cpp b/src/core/NEON/kernels/NESobel7x7Kernel.cpp
index f2b42cc..95ab12b 100644
--- a/src/core/NEON/kernels/NESobel7x7Kernel.cpp
+++ b/src/core/NEON/kernels/NESobel7x7Kernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
index 41bf03a..bc5b0c0 100644
--- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -368,6 +368,10 @@
             {
                 sum_inversed = 256.f / sum;
             }
+            else
+            {
+                sum = std::log(sum);
+            }
         }
 
         /* Normalize exponentials */
@@ -516,6 +520,10 @@
             {
                 sum_inversed = T(1) / sum;
             }
+            else
+            {
+                sum = static_cast<T>(std::log(sum));
+            }
         }
 
         /* Normalize exponentials */
diff --git a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
index 88ea44a..e2fe88c 100644
--- a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
index 4087d8c..b342cd2 100644
--- a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEStackLayerKernel.cpp b/src/core/NEON/kernels/NEStackLayerKernel.cpp
index 7ed6bb5..1d44be6 100644
--- a/src/core/NEON/kernels/NEStackLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEStackLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEStridedSliceKernel.cpp b/src/core/NEON/kernels/NEStridedSliceKernel.cpp
index 15f786a..243a60f 100644
--- a/src/core/NEON/kernels/NEStridedSliceKernel.cpp
+++ b/src/core/NEON/kernels/NEStridedSliceKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -71,7 +71,7 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input, ITensorInfo *output,
                                                         const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
                                                         int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
 {
@@ -127,22 +127,20 @@
 } // namespace
 
 NEStridedSliceKernel::NEStridedSliceKernel()
-    : _input(nullptr), _output(nullptr), _starts_abs(), _final_strides(), _shrink_mask()
+    : _starts_abs(), _final_strides(), _shrink_mask()
 {
 }
 
-void NEStridedSliceKernel::configure(const ITensor *input, ITensor *output,
+void NEStridedSliceKernel::configure(const ITensorInfo *input, ITensorInfo *output,
                                      const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
                                      int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
 
-    _input       = input;
-    _output      = output;
     _shrink_mask = shrink_axis_mask;
 
-    const TensorShape &input_shape = input->info()->tensor_shape();
+    const TensorShape &input_shape = input->tensor_shape();
 
     Coordinates ends_abs;
     std::tie(_starts_abs, ends_abs, _final_strides) = arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(
@@ -151,7 +149,7 @@
                                                           begin_mask, end_mask, shrink_axis_mask);
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+    auto win_config = validate_and_configure_window(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     INEKernel::configure(win_config.second);
 }
@@ -168,13 +166,15 @@
     return Status{};
 }
 
-void NEStridedSliceKernel::run(const Window &window, const ThreadInfo &info)
+void NEStridedSliceKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
     // Dispatch kernel
-    strided_slice_generic(_input, _output, _starts_abs, _final_strides, _shrink_mask, window);
+    strided_slice_generic(tensors.get_const_tensor(TensorType::ACL_SRC_0),
+                          tensors.get_tensor(TensorType::ACL_DST),
+                          _starts_abs, _final_strides, _shrink_mask, window);
 }
 } // namespace arm_compute
diff --git a/src/core/NEON/kernels/NETableLookupKernel.cpp b/src/core/NEON/kernels/NETableLookupKernel.cpp
index 536c220..d26a0ee 100644
--- a/src/core/NEON/kernels/NETableLookupKernel.cpp
+++ b/src/core/NEON/kernels/NETableLookupKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEThresholdKernel.cpp b/src/core/NEON/kernels/NEThresholdKernel.cpp
index 5c3b2a7..6b291fd 100644
--- a/src/core/NEON/kernels/NEThresholdKernel.cpp
+++ b/src/core/NEON/kernels/NEThresholdKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,30 +28,60 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Validate.h"
 
-#include <arm_neon.h>
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
 {
-class Coordinates;
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ThresholdKernelInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    // Configure kernel window
+    Window win = calculate_max_window(*input, Steps());
+
+    // Output auto inizialitation if not yet initialized
+    auto_init_if_empty(*output, *input->clone());
+
+    // NEThresholdKernel doesn't need padding so update_window_and_padding() can be skipped
+    Coordinates coord;
+    coord.set_num_dimensions(output->num_dimensions());
+    output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
+
+    return std::make_pair(Status{}, win);
+}
+} // namespace
 
 NEThresholdKernel::NEThresholdKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _threshold(0), _false_value(0), _true_value(0), _upper(0)
+    : _func(nullptr), _input(nullptr), _output(nullptr), _info()
 {
 }
 
-void NEThresholdKernel::configure(const ITensor *input, ITensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
+void NEThresholdKernel::configure(const ITensor *input, ITensor *output, const ThresholdKernelInfo &info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), info));
 
-    _input       = input;
-    _output      = output;
-    _threshold   = threshold;
-    _false_value = false_value;
-    _true_value  = true_value;
-    _upper       = upper;
+    _input  = input;
+    _output = output;
+    _info   = info;
 
-    switch(type)
+    switch(_info.type)
     {
         case ThresholdType::BINARY:
             _func = &NEThresholdKernel::run_binary;
@@ -64,54 +94,111 @@
             break;
     }
 
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICPPKernel::configure(win_config.second);
+}
 
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-    update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), output_access);
-    output_access.set_valid_region(win, input->info()->valid_region());
+Status NEThresholdKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ThresholdKernelInfo &info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
 
-    INEKernel::configure(win);
+    return Status{};
 }
 
 inline void NEThresholdKernel::run_binary(const Window &window)
 {
-    const uint8x16_t threshold   = vdupq_n_u8(_threshold);
-    const uint8x16_t true_value  = vdupq_n_u8(_true_value);
-    const uint8x16_t false_value = vdupq_n_u8(_false_value);
+    /** NEON vector tag type. */
+    using Type         = uint8_t;
+    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<Type, wrapper::traits::BitWidth::W128>;
 
-    Iterator input(_input, window);
-    Iterator output(_output, window);
+    const int  window_step_x  = 16 / sizeof(Type);
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
 
-    execute_window_loop(window, [&](const Coordinates &)
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    const uint8_t threshold   = _info.threshold;
+    const uint8_t true_value  = _info.true_value;
+    const uint8_t false_value = _info.false_value;
+
+    const auto vthreshold   = wrapper::vdup_n(threshold, ExactTagType{});
+    const auto vtrue_value  = wrapper::vdup_n(true_value, ExactTagType{});
+    const auto vfalse_value = wrapper::vdup_n(false_value, ExactTagType{});
+
+    Iterator input(_input, win_collapsed);
+    Iterator output(_output, win_collapsed);
+
+    execute_window_loop(win_collapsed, [&](const Coordinates &)
     {
-        const uint8x16_t data = vld1q_u8(input.ptr());
-        const uint8x16_t mask = vcgtq_u8(data, threshold);
+        const auto input_ptr  = reinterpret_cast<const Type *>(input.ptr());
+        const auto output_ptr = reinterpret_cast<Type *>(output.ptr());
 
-        vst1q_u8(output.ptr(), vbslq_u8(mask, true_value, false_value));
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        {
+            const auto vdata = wrapper::vloadq(input_ptr + x);
+            const auto vmask = wrapper::vcgt(vdata, vthreshold);
+            wrapper::vstore(output_ptr + x, wrapper::vbsl(vmask, vtrue_value, vfalse_value));
+        }
+
+        for(; x < window_end_x; ++x)
+        {
+            const Type data   = *(reinterpret_cast<const Type *>(input_ptr + x));
+            *(output_ptr + x) = (data > threshold) ? true_value : false_value;
+        }
     },
     input, output);
 }
 
 inline void NEThresholdKernel::run_range(const Window &window)
 {
-    const uint8x16_t lower_threshold = vdupq_n_u8(_threshold);
-    const uint8x16_t upper_threshold = vdupq_n_u8(_upper);
-    const uint8x16_t true_value      = vdupq_n_u8(_true_value);
-    const uint8x16_t false_value     = vdupq_n_u8(_false_value);
+    /** NEON vector tag type. */
+    using Type         = uint8_t;
+    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<Type, wrapper::traits::BitWidth::W128>;
 
-    Iterator input(_input, window);
-    Iterator output(_output, window);
+    const int  window_step_x  = 16 / sizeof(Type);
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
 
-    execute_window_loop(window, [&](const Coordinates &)
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    const uint8_t lower_threshold = _info.threshold;
+    const uint8_t upper_threshold = _info.upper;
+    const uint8_t true_value      = _info.true_value;
+    const uint8_t false_value     = _info.false_value;
+
+    const auto vlower_threshold = wrapper::vdup_n(lower_threshold, ExactTagType{});
+    const auto vupper_threshold = wrapper::vdup_n(upper_threshold, ExactTagType{});
+    const auto vtrue_value      = wrapper::vdup_n(true_value, ExactTagType{});
+    const auto vfalse_value     = wrapper::vdup_n(false_value, ExactTagType{});
+
+    Iterator input(_input, win_collapsed);
+    Iterator output(_output, win_collapsed);
+
+    execute_window_loop(win_collapsed, [&](const Coordinates &)
     {
-        const uint8x16_t data = vld1q_u8(input.ptr());
+        const auto input_ptr  = reinterpret_cast<const Type *>(input.ptr());
+        const auto output_ptr = reinterpret_cast<Type *>(output.ptr());
 
-        uint8x16_t mask = vcleq_u8(data, upper_threshold);
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        {
+            const auto vdata = wrapper::vloadq(input_ptr + x);
+            auto       vmask = wrapper::vcle(vdata, vupper_threshold);
+            vmask            = wrapper::vand(wrapper::vcge(vdata, vlower_threshold), vmask);
+            wrapper::vstore(output_ptr + x, wrapper::vbsl(vmask, vtrue_value, vfalse_value));
+        }
 
-        mask = vandq_u8(vcgeq_u8(data, lower_threshold), mask);
-
-        vst1q_u8(output.ptr(), vbslq_u8(mask, true_value, false_value));
+        for(; x < window_end_x; ++x)
+        {
+            const Type data   = *(reinterpret_cast<const Type *>(input_ptr + x));
+            *(output_ptr + x) = (data <= upper_threshold && data >= lower_threshold) ? true_value : false_value;
+        }
     },
     input, output);
 }
diff --git a/src/core/NEON/kernels/NETileKernel.cpp b/src/core/NEON/kernels/NETileKernel.cpp
index 98f66e8..cc7655a 100644
--- a/src/core/NEON/kernels/NETileKernel.cpp
+++ b/src/core/NEON/kernels/NETileKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NETransposeKernel.cpp b/src/core/NEON/kernels/NETransposeKernel.cpp
index 2951a16..7118e45 100644
--- a/src/core/NEON/kernels/NETransposeKernel.cpp
+++ b/src/core/NEON/kernels/NETransposeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp b/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
index 3b6faea..02cf133 100644
--- a/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEUpsampleLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,11 +27,11 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
 
 #include <arm_neon.h>
 
@@ -50,48 +50,9 @@
     }
     return out;
 }
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, int num_elems_processed_per_iteration_x, const Size2D &info)
-{
-    std::pair<Status, Window> win_config;
-    switch(input->data_layout())
-    {
-        case DataLayout::NCHW:
-        {
-            const int              num_elems_processed_per_iteration_x_out = num_elems_processed_per_iteration_x * info.x();
-            Window                 win                                     = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x_out));
-            AccessWindowRectangle  input_access(input, 0, 0, num_elems_processed_per_iteration_x, 1, 0.5f, 0.5f);
-            AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_x_out);
-            bool                   window_changed = update_window_and_padding(win, input_access, output_access);
-            output_access.set_valid_region(win, output->valid_region());
-
-            Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-            win_config = std::make_pair(err, win);
-            break;
-        }
-        case DataLayout::NHWC:
-        {
-            Window                 win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x));
-            AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration_x);
-            AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_x);
-            bool                   window_changed = update_window_and_padding(win, input_access, output_access);
-            output_access.set_valid_region(win, output->valid_region());
-
-            Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-            win_config = std::make_pair(err, win);
-            break;
-        }
-        default:
-        {
-            win_config = std::make_pair(ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported data layout!"), Window{});
-        }
-    }
-
-    return win_config;
-}
 } // namespace
 NEUpsampleLayerKernel::NEUpsampleLayerKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _info(), _num_elems_processed_per_iteration_x()
+    : _func(nullptr), _input(nullptr), _output(nullptr), _info()
 {
 }
 
@@ -118,11 +79,6 @@
         ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_height) != info.y() * input->dimension(idx_height));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
     }
-
-    const int num_elems_processed_per_iteration_x = 16 / input->element_size();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
-                                                              output->clone().get(), num_elems_processed_per_iteration_x, info)
-                                .first);
     return Status{};
 }
 
@@ -132,26 +88,46 @@
     using VectorType = typename wrapper::traits::neon_vector<T, S>::type;
 
     Window window_in(window);
-    window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _num_elems_processed_per_iteration_x));
+    window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
 
     Window window_out(window);
+    window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
     window_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), _info.y()));
 
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+    const int  window_step_x  = S;
+
     Iterator  input(_input, window_in);
     Iterator  output(_output, window_out);
     const int offset_y_out = _output->info()->strides_in_bytes().y() / sizeof(T);
 
     execute_window_loop(window_out, [&](const Coordinates &)
     {
-        const VectorType data      = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr()));
-        const VectorType data_out1 = get_data_out<VectorType, S>(data, 0);
-        const VectorType data_out2 = get_data_out<VectorType, S>(data, S / 2);
-        auto              out       = reinterpret_cast<T *>(output.ptr());
+        const auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
+        const auto output_ptr = reinterpret_cast<T *>(output.ptr());
 
-        wrapper::vstore(out, data_out1);
-        wrapper::vstore(out + S, data_out2);
-        wrapper::vstore(out + offset_y_out, data_out1);
-        wrapper::vstore(out + offset_y_out + S, data_out2);
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        {
+            const VectorType data      = wrapper::vloadq(reinterpret_cast<const T *>(input_ptr + x));
+            const VectorType data_out1 = get_data_out<VectorType, S>(data, 0);
+            const VectorType data_out2 = get_data_out<VectorType, S>(data, S / 2);
+
+            wrapper::vstore(output_ptr + 2 * x, data_out1);
+            wrapper::vstore(output_ptr + 2 * x + S, data_out2);
+            wrapper::vstore(output_ptr + 2 * x + offset_y_out, data_out1);
+            wrapper::vstore(output_ptr + 2 * x + offset_y_out + S, data_out2);
+        }
+
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
+        {
+            *(output_ptr + 2 * x)                    = *(input_ptr + x);
+            *(output_ptr + 2 * x + 1)                = *(input_ptr + x);
+            *(output_ptr + 2 * x + offset_y_out)     = *(input_ptr + x);
+            *(output_ptr + 2 * x + offset_y_out + 1) = *(input_ptr + x);
+        }
     },
     input, output);
 }
@@ -162,23 +138,47 @@
     using VectorType = typename wrapper::traits::neon_vector<T, S>::type;
 
     Window window_out(window);
+    window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
     window_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), _info.x()));
     window_out.set(Window::DimZ, Window::Dimension(0, _output->info()->dimension(2), _info.y()));
 
-    Iterator input(_input, window);
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+    const int  window_step_x  = S;
+
+    Window window_in{ window };
+    window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input(_input, window_in);
     Iterator output(_output, window_out);
 
     const int offset_y_out = _output->info()->strides_in_bytes().y() / sizeof(T);
     const int offset_z_out = _output->info()->strides_in_bytes().z() / sizeof(T);
+
     execute_window_loop(window_out, [&](const Coordinates &)
     {
-        const VectorType data = wrapper::vloadq(reinterpret_cast<const T *>(input.ptr()));
-        auto              out  = reinterpret_cast<T *>(output.ptr());
+        const auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
+        const auto output_ptr = reinterpret_cast<T *>(output.ptr());
 
-        wrapper::vstore(out, data);
-        wrapper::vstore(out + offset_y_out, data);
-        wrapper::vstore(out + offset_z_out, data);
-        wrapper::vstore(out + offset_y_out + offset_z_out, data);
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        {
+            const VectorType data = wrapper::vloadq(reinterpret_cast<const T *>(input_ptr + x));
+
+            wrapper::vstore(output_ptr + x, data);
+            wrapper::vstore(output_ptr + x + offset_y_out, data);
+            wrapper::vstore(output_ptr + x + offset_z_out, data);
+            wrapper::vstore(output_ptr + x + offset_y_out + offset_z_out, data);
+        }
+
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
+        {
+            *(output_ptr + x)                               = *(input_ptr + x);
+            *(output_ptr + x + offset_y_out)                = *(input_ptr + x);
+            *(output_ptr + x + offset_z_out)                = *(input_ptr + x);
+            *(output_ptr + x + offset_y_out + offset_z_out) = *(input_ptr + x);
+        }
     },
     input, output);
 }
@@ -201,8 +201,6 @@
     // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(NEUpsampleLayerKernel::validate(input->info(), output->info(), info, policy));
 
-    _num_elems_processed_per_iteration_x = 16 / output->info()->element_size();
-
     switch(data_layout)
     {
         case DataLayout::NCHW:
@@ -257,12 +255,11 @@
     }
 
     // Configure window
-    std::pair<Status, Window> win_config = validate_and_configure_window(input->info(),
-                                                                         output->info(),
-                                                                         _num_elems_processed_per_iteration_x,
-                                                                         info);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    INEKernel::configure(win_config.second);
+    Window      win = calculate_max_window(*input->info(), Steps());
+    Coordinates coord;
+    coord.set_num_dimensions(output->info()->num_dimensions());
+    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+    INEKernel::configure(win);
 }
 
 void NEUpsampleLayerKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEWarpKernel.cpp b/src/core/NEON/kernels/NEWarpKernel.cpp
index d04bc07..d8191dc 100644
--- a/src/core/NEON/kernels/NEWarpKernel.cpp
+++ b/src/core/NEON/kernels/NEWarpKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
index d376d53..6a74914 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,15 +23,11 @@
  */
 #include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
 
-#include "arm_compute/core/Dimensions.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 TensorShape get_output_shape(const ITensorInfo *input, bool has_bias)
@@ -48,11 +44,9 @@
 
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output)
 {
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1,
-                                                         DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL,
-                                                         DataType::BFLOAT16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
 
     if(biases != nullptr)
     {
@@ -179,3 +173,4 @@
     },
     in);
 }
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
index 711c3fe..171f596 100644
--- a/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,24 +58,22 @@
 } // namespace
 
 NEWidthConcatenateLayerKernel::NEWidthConcatenateLayerKernel()
-    : _input(nullptr), _output(nullptr), _width_offset(0)
+    : _width_offset(0)
 {
 }
 
-void NEWidthConcatenateLayerKernel::configure(const ITensor *input, unsigned int width_offset, ITensor *output)
+void NEWidthConcatenateLayerKernel::configure(const ITensorInfo *input, unsigned int width_offset, ITensorInfo *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), width_offset, output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, width_offset, output));
 
-    _input        = input;
-    _output       = output;
     _width_offset = width_offset;
 
     // Configure kernel window
-    Window      win = calculate_max_window(*input->info(), Steps());
+    Window      win = calculate_max_window(*input, Steps());
     Coordinates coord;
-    coord.set_num_dimensions(output->info()->num_dimensions());
-    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+    coord.set_num_dimensions(output->num_dimensions());
+    output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
 
     INEKernel::configure(win);
 }
@@ -86,28 +84,31 @@
     return Status{};
 }
 
-void NEWidthConcatenateLayerKernel::run(const Window &window, const ThreadInfo &info)
+void NEWidthConcatenateLayerKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
+    const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+    auto       dst = tensors.get_tensor(TensorType::ACL_DST);
+
     // Offset output pointer to the correct position
-    uint8_t *output_ptr = _output->buffer() + _output->info()->offset_first_element_in_bytes() + _width_offset * _output->info()->strides_in_bytes()[0];
+    uint8_t *output_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + _width_offset * dst->info()->strides_in_bytes()[0];
 
     const auto    window_start_x = static_cast<int>(window.x().start());
-    const auto    window_end_x   = static_cast<int>(window.x().end()) * static_cast<int>(_output->info()->element_size());
+    const auto    window_end_x   = static_cast<int>(window.x().end()) * static_cast<int>(dst->info()->element_size());
     constexpr int window_step_x  = 16;
 
     Window win{ window };
     win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
     // Create iterators
-    Iterator                       input(_input, win);
-    Iterator                       output(_output, win);
-    const DataType                 dt           = _input->info()->data_type();
-    const UniformQuantizationInfo &input_qinfo  = _input->info()->quantization_info().uniform();
-    const UniformQuantizationInfo &output_qinfo = _output->info()->quantization_info().uniform();
+    Iterator                       input(src, win);
+    Iterator                       output(dst, win);
+    const DataType                 dt           = src->info()->data_type();
+    const UniformQuantizationInfo &input_qinfo  = src->info()->quantization_info().uniform();
+    const UniformQuantizationInfo &output_qinfo = dst->info()->quantization_info().uniform();
     if(dt == DataType::QASYMM8 && input_qinfo != output_qinfo)
     {
         execute_window_loop(win, [&](const Coordinates &)
diff --git a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
index 3100bf7..bfe97bf 100644
--- a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"
+#include "src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"
 
 #include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Error.h"
@@ -35,6 +35,8 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "support/MemorySupport.h"
 
+#include "src/core/NEON/kernels/convolution/winograd/winograd_layer.hpp"
+
 namespace arm_compute
 {
 //Batched Gemms
diff --git a/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h
similarity index 99%
rename from arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h
rename to src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h
index 1740df0..94df4f6 100644
--- a/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h
+++ b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,7 +27,8 @@
 #include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/NEON/kernels/convolution/common/convolution.hpp"
 #include "arm_compute/core/NEON/kernels/convolution/common/tensor.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_layer.hpp"
+
+#include "src/core/NEON/kernels/convolution/winograd/winograd_layer.hpp"
 
 namespace arm_compute
 {
diff --git a/src/core/NEON/kernels/NEYOLOLayerKernel.cpp b/src/core/NEON/kernels/NEYOLOLayerKernel.cpp
index 5956118..b61633d 100644
--- a/src/core/NEON/kernels/NEYOLOLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEYOLOLayerKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/asmlib.hpp b/src/core/NEON/kernels/arm_gemm/asmlib.hpp
index 38f51ae..0067433 100644
--- a/src/core/NEON/kernels/arm_gemm/asmlib.hpp
+++ b/src/core/NEON/kernels/arm_gemm/asmlib.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/barrier.hpp b/src/core/NEON/kernels/arm_gemm/barrier.hpp
index cfd1079..8fbcddf 100644
--- a/src/core/NEON/kernels/arm_gemm/barrier.hpp
+++ b/src/core/NEON/kernels/arm_gemm/barrier.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/bfloat.hpp b/src/core/NEON/kernels/arm_gemm/bfloat.hpp
index 547c668..8d1ea03 100644
--- a/src/core/NEON/kernels/arm_gemm/bfloat.hpp
+++ b/src/core/NEON/kernels/arm_gemm/bfloat.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,5 +29,4 @@
 
 using bfloat16 = arm_compute::bfloat16;
 
-} // namespace arm_gemm
-
+} // namespace arm_gemm
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/bias_adder.hpp b/src/core/NEON/kernels/arm_gemm/bias_adder.hpp
index 745d005..5d363fd 100644
--- a/src/core/NEON/kernels/arm_gemm/bias_adder.hpp
+++ b/src/core/NEON/kernels/arm_gemm/bias_adder.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp b/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp
deleted file mode 100644
index 001cab7..0000000
--- a/src/core/NEON/kernels/arm_gemm/buffer_manager.hpp
+++ /dev/null
@@ -1,341 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include <cstdlib>
-#include <vector>
-
-#ifndef NO_MULTI_THREADING
-#include <atomic>
-#include <mutex>
-
-#define USE_SEMAPHORE
-
-#ifdef USE_SEMAPHORE
-#include <condition_variable>
-#endif
-
-#endif
-
-namespace arm_gemm {
-
-#ifndef NO_MULTI_THREADING
-enum class BufferStatus {
-    IDLE,
-    POPULATING,
-    BUSY
-};
-
-class Buffer {
-private:
-    const int                _maxusers;    // Maximum permissible threads.
-    void * const             _storage;     // Storage for buffer content.
-
-    int                      _numusers;    // Actual number of threads (might be lower).
-
-    volatile BufferStatus    _status = BufferStatus::IDLE; // Status
-    std::atomic_int          _users = { };   // How many users are still using the buffer.
-    volatile int             _index = 0;   // Which block of data currently resides in the buffer.
-
-    std::mutex               _lock = { };
-#ifdef USE_SEMAPHORE
-    std::condition_variable  _cv = { };
-#endif
-
-    template <typename T>
-    void populate_buffer(T func) {
-        func(_storage);
-
-        /* Now mark it as ready. */
-#ifdef USE_SEMAPHORE
-        {
-            std::unique_lock<std::mutex> ul(_lock);
-            _status = BufferStatus::BUSY;
-            _cv.notify_all();
-        }
-#else
-        _status = BufferStatus::BUSY;
-#endif
-    }
-
-public:
-    Buffer(Buffer &) = delete;
-    Buffer &operator= (Buffer &) = delete;
-
-    Buffer(void *storage, int maxusers) : _maxusers(maxusers), _storage(storage), _numusers(maxusers) {
-        _status = BufferStatus::IDLE;
-    }
-
-    /* Try and populate the given index.
-     * Wait if the buffer is busy with previous index, then:
-     *
-     * If the buffer is idle, grab it and populate it.
-     * If it's already being populated by another thread or is ready, return.
-     */
-    template <typename T>
-    void try_populate(const int index, T func) {
-        for (;;) {
-#ifdef USE_SEMAPHORE
-            /* If it's busy with a previous index, wait on the semaphore. */
-            if ((_status == BufferStatus::BUSY) && (_index != index)) {
-                std::unique_lock<std::mutex> ul(_lock);
-
-                if ((_status == BufferStatus::BUSY) && (_index != index)) {
-                    _cv.wait(ul);
-                }
-            }
-#endif
-            /* Return if another thread is populating it already. */
-            if ((_index == index) &&
-                ((_status == BufferStatus::POPULATING) || (_status == BufferStatus::BUSY))) {
-                return;
-            }
-
-            if (_status == BufferStatus::IDLE) {
-                std::lock_guard<std::mutex> guard(_lock);
-
-                /* If the buffer is still idle, we can grab it and populate it. */
-                if (_status == BufferStatus::IDLE) {
-                    _status = BufferStatus::POPULATING;
-                    _index = index;
-                    _users = _numusers;
-                    break;
-                }
-            }
-        }
-
-        /* If we get here, fill in the buffer. */
-        populate_buffer(func);
-    }
-
-    template <typename T>
-    void *get(const int index, T func) {
-        // Loop until we achieve something.
-        for (;;) {
-            // If the index is correct and the buffer status is busy then we can
-            // just return the content.  No locking is needed here as the index
-            // cannot change (and status cannot change from BUSY) until all
-            // users have finished.
-            if ((_index == index) && (_status == BufferStatus::BUSY)) {
-                return _storage;
-            }
-
-            /* If the buffer still has some previous content, or is being
-             * populated, we can wait with the semaphore.  */
-#ifdef USE_SEMAPHORE
-            if (((_status == BufferStatus::BUSY) && (_index != index)) ||
-                 (_status == BufferStatus::POPULATING)) {
-                std::unique_lock<std::mutex> ul(_lock);
-
-                if (((_status == BufferStatus::BUSY) && (_index != index)) ||
-                     (_status == BufferStatus::POPULATING)) {
-                    _cv.wait(ul);
-                }
-            }
-#endif
-
-            // If it's idle, we need to populate it.  The IDLE->POPULATING
-            // transition requires the lock.
-            if (_status == BufferStatus::IDLE) {
-                std::lock_guard<std::mutex> guard(_lock);
-
-                /* If it's still idle, grab it.  Otherwise drop through and
-                 * we'll do something else next time through the loop.  */
-                if (_status == BufferStatus::IDLE) {
-                    _status = BufferStatus::POPULATING;
-                    _index = index;
-                    _users = _numusers;
-                    break;
-                }
-            }
-        }
-
-        /* If we get here we need to populate the buffer. */
-        populate_buffer(func);
-
-        return _storage;
-    }
-
-    /* Threads call this when they have finished processing a buffer.  We
-     * simply (atomically) decrement the user count, and if it's hit zero we
-     * flag the buffer as idle.
-     */
-    void release(void) {
-        if (--_users == 0) {
-#ifdef USE_SEMAPHORE
-            std::unique_lock<std::mutex> ul(_lock);
-            _status = BufferStatus::IDLE;
-            /* We notify all waiters as we expect one to do the populating
-             * and any others to go and process and earlier block.  */
-            _cv.notify_all();
-#else
-            _status = BufferStatus::IDLE;
-#endif
-        }
-    }
-
-    /* This is called to change the number of users. */
-    void set_numusers(int numusers) {
-        _numusers = std::min(numusers, _maxusers);
-    }
-};
-
-
-class BufferManager {
-private:
-    /* This has to be a vector of Buffer *, because a Buffer cannot be moved
-     * or copied due to atomic members. */
-    std::vector<Buffer *> _buffers = { };
-    const int _maxthreads;
-    void * const _storage;
-
-public:
-    BufferManager(BufferManager &) = delete;
-    BufferManager & operator=(BufferManager &) = delete;
-
-    // Say how much storage is needed.
-    static inline size_t get_storage_requirement(const int maxthreads, const size_t buffersize) {
-        return buffersize * ((maxthreads == 1) ? 1 : 3);
-    }
-
-    BufferManager(const int maxthreads, const size_t buffersize, void *storage) : _maxthreads(maxthreads), _storage(storage) {
-        const int numbuffers = (maxthreads == 1) ? 1 : 3;
-
-        /* We don't need any Buffer objects in single thread mode. */
-        if (_maxthreads == 1) {
-            return;
-        }
-
-        /* Use intptr_t to avoid performing arithmetic on a void * */
-        intptr_t storage_int = reinterpret_cast<intptr_t>(_storage);
-
-        for (int i=0; i<numbuffers; i++) {
-            _buffers.push_back(new Buffer(reinterpret_cast<void *>(storage_int), _maxthreads));
-            storage_int += buffersize;
-        }
-    }
-
-    ~BufferManager() {
-        while (_buffers.size()) {
-            delete _buffers.back();
-            _buffers.pop_back();
-        }
-    }
-
-    template <typename T>
-    void *get(const int index, T func) {
-        /* In single thread mode, we just directly call the populating
-         * function on the (single) buffer, otherwise forward to the
-         * relevant Buffer.  */
-        if (_maxthreads==1) {
-            func(_storage);
-            return _storage;
-        } else {
-            return _buffers[index % _buffers.size()]->get(index, func);
-        }
-    }
-
-    template <typename T>
-    void try_populate(const int index, T func) {
-        /* No need for this in single thread mode. */
-        if (_maxthreads==1) {
-            return;
-        }
-
-        _buffers[index % _buffers.size()]->try_populate(index, func);
-    }
-
-    void release(const int index) {
-        /* No need for this in single thread mode. */
-        if (_maxthreads==1) {
-            return;
-        }
-
-        _buffers[index % _buffers.size()]->release();
-    }
-
-    void set_nthreads(int threads) {
-        if (_maxthreads==1) {
-            return;
-        }
-
-        for(unsigned int i=0; i<_buffers.size(); i++) {
-            _buffers[i]->set_numusers(threads);
-        }
-    }
-};
-
-#else
-
-/* Trivial implementation if threading is disabled at compile time.
- *
- * Here, we only need storage for a single buffer.  The 'get' method needs
- * to call the supplied function to populate the buffer and then return it.
- * All the other methods do nothing.
- */
-
-class BufferManager {
-private:
-    void * const _storage;
-
-public:
-    BufferManager(BufferManager &) = delete;
-    BufferManager & operator=(BufferManager &) = delete;
-
-    BufferManager(const int maxthreads, const size_t buffersize, void *storage) : _storage(storage) {
-        UNUSED(maxthreads);
-        UNUSED(buffersize);
-    }
-
-    ~BufferManager() { }
-
-    // Say how much storage is needed.
-    static inline size_t get_storage_requirement(const int maxthreads, const size_t buffersize) {
-        UNUSED(maxthreads);
-        return buffersize;
-    }
-
-    template <typename T>
-    void try_populate(const int index, T func) {
-         UNUSED(index);
-         UNUSED(func);
-    }
-
-    void release(const int index) {
-         UNUSED(index);
-    }
-
-    template <typename T>
-    void *get(const int index, T func) {
-        UNUSED(index);
-        func(_storage);
-        return _storage;
-    }
-
-    void set_nthreads(int) { }
-};
-
-#endif
-
-} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
index 10fee47..f3b6652 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,13 +22,12 @@
  * SOFTWARE.
  */
 #include "arm_gemm.hpp"
+#include "bfloat.hpp"
 #include "gemm_common.hpp"
 #include "gemm_hybrid.hpp"
 #include "gemm_implementation.hpp"
 #include "gemm_interleaved.hpp"
-#include "gemm_native.hpp"
 #include "gemv_batched.hpp"
-#include "gemv_native_transposed.hpp"
 #include "gemv_pretransposed.hpp"
 
 #include "kernels/a64_interleaved_bf16fp32_dot_12x8.hpp"
@@ -37,17 +36,13 @@
 #include "kernels/a32_sgemm_8x6.hpp"
 #include "kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp"
 #include "kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp"
-#include "kernels/sve_native_bf16fp32_dot_4VLx4.hpp"
 #include "kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp"
 #include "kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp"
 #include "kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp"
 #include "kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp"
 
-#include "bfloat.hpp"
-
 namespace arm_gemm {
 
-
 static const GemmImplementation<bfloat16, float> gemm_bf16_methods[] =
 {
 #ifdef V8P6_BF
@@ -55,38 +50,31 @@
 {
     GemmMethod::GEMM_HYBRID,
     "hybrid_bf16fp32_mmla_6VLx2",
-    [](const GemmArgs &args) { return (args._Ksize>=8 && !args._trA && args._pretransposed_hint); },
+    [](const GemmArgs &args) { return (args._Ksize>=8); },
     [](const GemmArgs &args) { return ((args._Msize <= 4) && (args._Nsize <= hybrid_bf16fp32_mmla_6VLx2::out_width())); },
     [](const GemmArgs &args) { return new GemmHybrid<hybrid_bf16fp32_mmla_6VLx2, bfloat16, float>(args); }
 },
 {
     GemmMethod::GEMM_HYBRID,
     "hybrid_bf16fp32_mmla_8VLx2",
-    [](const GemmArgs &args) { return (args._Ksize>=8 && !args._trA && args._pretransposed_hint); },
+    [](const GemmArgs &args) { return (args._Ksize>=8); },
     [](const GemmArgs &args) { return (args._Msize <= 4); },
     [](const GemmArgs &args) { return new GemmHybrid<hybrid_bf16fp32_mmla_8VLx2, bfloat16, float>(args); }
 },
 {
     GemmMethod::GEMM_HYBRID,
     "hybrid_bf16fp32_mmla_4VLx4",
-    [](const GemmArgs &args) { return (args._Ksize>=8 && !args._trA && args._pretransposed_hint); },
+    [](const GemmArgs &args) { return (args._Ksize>=8); },
     [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)); },
     [](const GemmArgs &args) { return new GemmHybrid<hybrid_bf16fp32_mmla_4VLx4, bfloat16, float>(args); }
 },
 {
     GemmMethod::GEMM_HYBRID,
     "hybrid_bf16fp32_dot_4VLx4",
-    [](const GemmArgs &args) { return (args._Ksize>=8 && !args._trA && args._pretransposed_hint); },
+    [](const GemmArgs &args) { return (args._Ksize>=8); },
     [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)); },
     [](const GemmArgs &args) { return new GemmHybrid<hybrid_bf16fp32_dot_4VLx4, bfloat16, float>(args); }
 },
-{ // gemm_bf16_native
-    GemmMethod::GEMM_NATIVE,
-    "native_bf16fp32_dot_4VLx4",
-    [](const GemmArgs &args) { return (args._Ksize>=8 && !args._trA && !args._trB); },
-    [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)); },
-    [](const GemmArgs &args) { return new GemmNative<native_bf16fp32_dot_4VLx4, bfloat16, float>(args); }
-},
 { // gemm_bf16_interleaved
     GemmMethod::GEMM_INTERLEAVED,
     "interleaved_bf16fp32_mmla_3VLx8",
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
index b6671e8..9101221 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,14 +31,13 @@
 #include "gemm_hybrid.hpp"
 #include "gemm_implementation.hpp"
 #include "gemm_interleaved.hpp"
-#include "gemm_native.hpp"
+#include "gemm_interleaved_pretransposed_2d.hpp"
 
 #include "kernels/a32_sgemm_8x6.hpp"
 #include "kernels/a64_hgemm_24x8.hpp"
 #include "kernels/a64_sgemm_12x8.hpp"
 #include "kernels/sve_hybrid_fp16_mla_4VLx4.hpp"
 #include "kernels/sve_interleaved_fp16_mla_3VLx8.hpp"
-#include "kernels/sve_native_fp16_mla_4VLx4.hpp"
 
 namespace arm_gemm {
 
@@ -47,18 +46,11 @@
 {
     GemmMethod::GEMM_HYBRID,
     "hybrid_fp16_mla_4VLx4",
-    [](const GemmArgs &args) { return (args._Ksize >= 8) && !args._trA && args._pretransposed_hint; },
+    [](const GemmArgs &args) { return (args._Ksize >= 8); },
     [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
     [](const GemmArgs &args) { return new GemmHybrid<hybrid_fp16_mla_4VLx4, __fp16, __fp16>(args); }
 },
 {
-    GemmMethod::GEMM_NATIVE,
-    "native_fp16_mla_4VLx4",
-    [](const GemmArgs &args) { return (args._Ksize >= 8 && !args._trA && !args._trB); },
-    [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
-    [](const GemmArgs &args) { return new GemmNative<native_fp16_mla_4VLx4, __fp16, __fp16>(args); }
-},
-{
     GemmMethod::GEMM_INTERLEAVED,
     "interleaved_fp16_mla_3VLx8",
     [](const GemmArgs &args) { return (args._Ksize > 4); },
@@ -69,8 +61,19 @@
 
 #if defined(__aarch64__) && (defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(FP16_KERNELS))
 {
+    GemmMethod::GEMM_INTERLEAVED_2D,
+    "hgemm_24x8_2d",
+#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    [](const GemmArgs &args) { return args._ci->has_fp16(); },
+#else
+    nullptr,
+#endif
+    [](const GemmArgs &args) { return args._maxthreads >= 8; },
+    [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<hgemm_24x8, __fp16, __fp16>(args); }
+},
+{
     GemmMethod::GEMM_INTERLEAVED,
-    "hgemm_24x8",
+    "hgemm_24x8_1d",
 #ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     [](const GemmArgs &args) { return args._ci->has_fp16(); },
 #else
@@ -79,11 +82,21 @@
     nullptr,
     [](const GemmArgs &args) { return new GemmInterleaved<hgemm_24x8, __fp16, __fp16>(args); }
 },
+
 #endif // aarch64 && FP16
 #ifdef __aarch64__
+//Pretranspose, 2D split
+{
+    GemmMethod::GEMM_INTERLEAVED_2D,
+    "sgemm_12x8_2d",
+    nullptr,
+    [](const GemmArgs &args) { return args._maxthreads >= 8; },
+    [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<sgemm_12x8, __fp16, __fp16>(args); }
+},
+//Transpose, 1D split, with blockmanager
 {
     GemmMethod::GEMM_INTERLEAVED,
-    "sgemm_12x8",
+    "sgemm_12x8_1d",
     nullptr,
     nullptr,
     [](const GemmArgs &args) { return new GemmInterleaved<sgemm_12x8, __fp16, __fp16>(args); }
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
index e3355ed..ddb438f 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,26 +26,22 @@
 #include "gemm_hybrid.hpp"
 #include "gemm_implementation.hpp"
 #include "gemm_interleaved.hpp"
-#include "gemm_interleaved_2d.hpp"
 #include "gemm_interleaved_pretransposed_2d.hpp"
-#include "gemm_native.hpp"
 #include "gemv_batched.hpp"
-#include "gemv_native_transposed.hpp"
 #include "gemv_pretransposed.hpp"
 
 #include "kernels/a32_sgemm_8x6.hpp"
 #include "kernels/a64_hybrid_fp32_mla_16x4.hpp"
 #include "kernels/a64_hybrid_fp32_mla_4x8.hpp"
-#include "kernels/a64_native_fp32_mla_16x4.hpp"
 #include "kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp"
 #include "kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp"
 #include "kernels/a64_sgemm_12x8.hpp"
 #include "kernels/a64_sgemv_pretransposed.hpp"
-#include "kernels/a64_sgemv_trans.hpp"
 
 #include "kernels/sve_hybrid_fp32_mla_4VLx4.hpp"
+#include "kernels/sve_hybrid_fp32_mmla_4VLx4.hpp"
 #include "kernels/sve_interleaved_fp32_mla_3VLx8.hpp"
-#include "kernels/sve_native_fp32_mla_4VLx4.hpp"
+#include "kernels/sve_interleaved_fp32_mmla_3VLx8.hpp"
 #include "kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp"
 
 namespace arm_gemm {
@@ -63,79 +59,74 @@
 {
     GemmMethod::GEMV_PRETRANSPOSED,
     "sgemv_pretransposed",
-    [](const GemmArgs &args) { return (args._Msize==1 && args._pretransposed_hint && args._nbatches==1); },
+    [](const GemmArgs &args) { return (args._Msize==1 && args._nbatches==1); },
     nullptr,
     [](const GemmArgs &args) { return new GemvPretransposed<sgemv_pretransposed, float, float>(args); }
 },
+#if defined(__ARM_FEATURE_SVE) && defined(MMLA_FP32)
 {
-    GemmMethod::GEMV_NATIVE_TRANSPOSED,
-    "sgemv_trans",
-    [](const GemmArgs &args) { return (args._Msize==1 && !args._trA && !args._trB && args._nbatches==1); },
-    nullptr,
-    [](const GemmArgs &args) { return new GemvNativeTransposed<sgemv_trans, float, float>(args); }
+    GemmMethod::GEMM_HYBRID,
+    "hybrid_fp32_mmla_4VLx4",
+    [](const GemmArgs &args) { return (args._Ksize >= 4); },
+    [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+    [](const GemmArgs &args) { return new GemmHybrid<hybrid_fp32_mmla_4VLx4, float, float>(args); }
 },
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "interleaved_fp32_mmla_3VLx8",
+    [](const GemmArgs &args) { return (args._Ksize>4); },
+    nullptr,
+    [](const GemmArgs &args) { return new GemmInterleaved<interleaved_fp32_mmla_3VLx8, float, float>(args); }
+},
+#endif // __ARM_FEATURE_SVE && MMLA_FP32
 
 #ifdef __ARM_FEATURE_SVE
-// SVE smallk / native / hybrid methods
+// SVE smallk / hybrid methods
 {
     GemmMethod::GEMM_HYBRID,
     "smallK_hybrid_fp32_mla_1VLx8",
-    [](const GemmArgs &args) { return (args._Ksize <= 24) && !args._trA && args._pretransposed_hint; },
+    [](const GemmArgs &args) { return (args._Ksize <= 24); },
     nullptr,
     [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_fp32_mla_1VLx8, float, float>(args); }
 },
 {
     GemmMethod::GEMM_HYBRID,
     "hybrid_fp32_mla_4VLx4",
-    [](const GemmArgs &args) { return (args._Ksize >= 4) && !args._trA && args._pretransposed_hint; },
+    [](const GemmArgs &args) { return (args._Ksize >= 4); },
     [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
     [](const GemmArgs &args) { return new GemmHybrid<hybrid_fp32_mla_4VLx4, float, float>(args); }
 },
-{
-    GemmMethod::GEMM_NATIVE,
-    "native_fp32_mla_4VLx4",
-    [](const GemmArgs &args) { return (args._Ksize>4 && !args._trA && !args._trB); },
-    [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
-    [](const GemmArgs &args) { return new GemmNative<native_fp32_mla_4VLx4, float, float>(args); }
-},
 #endif // __ARM_FEATURE_SVE
 
-// NEON native / hybrid methods
+// NEON hybrid methods
 {
     GemmMethod::GEMM_HYBRID,
     "smallK_hybrid_fp32_mla_4x8",
-    [](const GemmArgs &args) { return (args._Ksize <= 8) && (args._Nsize % 4)==0 && !args._trA && args._pretransposed_hint; },
+    [](const GemmArgs &args) { return (args._Ksize <= 8) && (args._Nsize % 4)==0; },
     nullptr,
     [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_fp32_mla_4x8, float, float>(args); }
 },
 {
     GemmMethod::GEMM_HYBRID,
     "smallK_hybrid_fp32_mla_4x6",
-    [](const GemmArgs &args) { return (args._Ksize > 8) && (args._Ksize <= 16) && (args._Nsize % 4)==0 && !args._trA && args._pretransposed_hint; },
+    [](const GemmArgs &args) { return (args._Ksize > 8) && (args._Ksize <= 16) && (args._Nsize % 4)==0; },
     nullptr,
     [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_fp32_mla_4x6, float, float>(args); }
 },
 {
     GemmMethod::GEMM_HYBRID,
     "hybrid_fp32_mla_4x8_normal",
-    [](const GemmArgs &args) { return (args._Ksize >= 4) && !args._trA && args._pretransposed_hint; },
+    [](const GemmArgs &args) { return (args._Ksize >= 4); },
     [](const GemmArgs &args) { return (args._Nsize < 12); },
     [](const GemmArgs &args) { return new GemmHybrid<hybrid_fp32_mla_4x8, float, float>(args); }
 },
-{
+GemmImplementation<float, float>::with_estimate(
     GemmMethod::GEMM_HYBRID,
-    "hybrid_fp32_mla_16x4_normal",
-    [](const GemmArgs &args) { return (args._Ksize >= 4) && !args._trA && args._pretransposed_hint; },
-    [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || (args._Msize < 16) || (args._nmulti > 1); },
+    "hybrid_fp32_mla_16x4",
+    [](const GemmArgs &args) { return (args._Ksize >= 4); },
+    [](const GemmArgs &args) { return GemmHybrid<hybrid_fp32_mla_16x4, float, float>::estimate_cycles(args, hybrid_fp32_mla_16x4::get_performance_parameters(args._ci)); },
     [](const GemmArgs &args) { return new GemmHybrid<hybrid_fp32_mla_16x4, float, float>(args); }
-},
-{
-    GemmMethod::GEMM_NATIVE,
-    "native_fp32_mla_16x4",
-    [](const GemmArgs &args) { return (args._Ksize>4 && (args._Nsize % 16)==0 && !args._trA && !args._trB); },
-    [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
-    [](const GemmArgs &args) { return new GemmNative<native_fp32_mla_16x4, float, float>(args); }
-},
+),
 
 #ifdef __ARM_FEATURE_SVE
 {
@@ -146,31 +137,22 @@
     [](const GemmArgs &args) { return new GemmInterleaved<interleaved_fp32_mla_3VLx8, float, float>(args); }
 },
 #endif // __ARM_FEATURE_SVE
-//Pretranpose, 2D split
-{
+// Pretranposed, 2D split
+GemmImplementation<float, float>::with_estimate(
     GemmMethod::GEMM_INTERLEAVED_2D,
-    "sgemm_12x8",
-    [](const GemmArgs &args) { return args._pretransposed_hint; },
-    [](const GemmArgs &args) { return args._pretransposed_hint; },
+    "sgemm_12x8_2d",
+    nullptr,
+    [](const GemmArgs &args) { return GemmInterleavedPretransposed2d<sgemm_12x8, float, float>::estimate_cycles(args, sgemm_12x8::get_performance_parameters(args._ci)); },
     [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<sgemm_12x8, float, float>(args); }
-},
-//Tranpose, 2D split, no blockmanager
-{
-    GemmMethod::GEMM_INTERLEAVED_2D,
-    "sgemm_12x8",
-    [](const GemmArgs &args) { return (!args._pretransposed_hint) && args._maxthreads >= 8; },
-    [](const GemmArgs &args) { return (!args._pretransposed_hint) && args._maxthreads >= 8; },
-    [](const GemmArgs &args) { return new GemmInterleaved2d<sgemm_12x8, float, float>(args); }
-},
-//Tranpose, 1D split, with blockmanager
-{
+),
+// 1D split (with pretransposed or not)
+GemmImplementation<float, float>::with_estimate(
     GemmMethod::GEMM_INTERLEAVED,
-    "sgemm_12x8",
-    [](const GemmArgs &args) { return (!args._pretransposed_hint); },
-    [](const GemmArgs &args) { return (!args._pretransposed_hint); },
+    "sgemm_12x8_1d",
+    nullptr,
+    [](const GemmArgs &args) { return GemmInterleaved<sgemm_12x8, float, float>::estimate_cycles(args, sgemm_12x8::get_performance_parameters(args._ci)); },
     [](const GemmArgs &args) { return new GemmInterleaved<sgemm_12x8, float, float>(args); }
-},
-
+),
 #endif // __aarch64__
 
 #ifdef __arm__
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
index aeeed26..7a983ed 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,18 +23,15 @@
  */
 #pragma once
 
-#include <assert.h>
-
 #include <algorithm>
+#include <cassert>
 
 #include "arm_gemm.hpp"
 #include "bias_adder.hpp"
-#include "utils.hpp"
-
-#include "arm_compute/core/NEON/kernels/arm_gemm/ndrange.hpp"
-
-#include "mergeresults.hpp"
+#include "ndrange.hpp"
+#include "performance_parameters.hpp"
 #include "transform.hpp"
+#include "utils.hpp"
 
 #ifdef CYCLE_PROFILING
 #include "profiler.hpp"
@@ -58,8 +55,6 @@
     const unsigned int _nbatches;
     const unsigned int _nmulti;
 
-    const bool _trB;
-
     const Activation _act;
 
     /* Blocking info */
@@ -73,8 +68,8 @@
     const NDRange<4> _window_range;
 
     static unsigned int compute_k_block(const GemmArgs &args) {
-        // Some kernels don't support append mode - these can't do K blocking at all.
-        if (!strategy::supports_append()) {
+        // Some kernels don't support accumulate mode - these can't do K blocking at all.
+        if (!strategy::supports_accumulate()) {
             return args._Ksize;
         }
 
@@ -136,7 +131,7 @@
     /* Constructor */
     GemmHybrid(const GemmArgs &args)
               : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
-                _nbatches(args._nbatches), _nmulti(args._nmulti), _trB(args._trB),
+                _nbatches(args._nbatches), _nmulti(args._nmulti),
                 _act(args._act),
                 _k_block(compute_k_block(args)), _n_block(compute_n_block(args)),
                 _Mround(roundup(args._Msize, strategy::out_height())),
@@ -144,7 +139,7 @@
 
     // Interface implementation - Compulsory functions
     ndrange_t get_window_size() const override {
-        return { _window_range.total_size(), 1u, 1u, 1u, 1u, 1u };
+        return { _window_range.total_size() };
     }
 
     // This kernel can always be dynamically scheduled.
@@ -152,8 +147,8 @@
         return true;
     }
 
-    void execute_1d(unsigned int start, unsigned int end, int threadid) {
-        UNUSED(threadid);
+    // Execute
+    void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override {
 #ifdef CYCLE_PROFILING
         profiler prof;
 #endif
@@ -174,7 +169,7 @@
             const bool first_pass = (k0 == 0);
             const bool last_pass = (kmax == _Ksize);
 
-            auto p = _window_range.iterator(start, end);
+            auto p = _window_range.iterator(work_range.get_position(0), work_range.get_position_end(0));
 
             if (p.done()) {
                 return;
@@ -194,7 +189,7 @@
                                      (n0 * kern_k);
 
 #ifdef CYCLE_PROFILING
-                auto p = prof.ScopedProfiler(PROFILE_KERNEL, (m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width()));
+                auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width()));
 #endif
 
                 strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda) + k0, this->_lda,
@@ -215,17 +210,6 @@
         }
     }
 
-    // Execute
-    void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
-        UNUSED(thread_locator);
-
-        const auto start = work_range.get_position(0);
-        const auto size  = work_range.get_size(0);
-        const auto stop  = start + size;
-
-        execute_1d(start, stop, threadid);
-    }
-
     // Interface implementation - pretransposed
     bool B_is_pretransposed() const override {
         return true;
@@ -255,7 +239,7 @@
                     const unsigned int size = roundup(xmax-x0, strategy::out_width()) * k_size;
 
                     strat.transforms.PrepareB( buffer, B + (multi * B_multi_stride), ldb,
-                                               x0, xmax, k0, kmax, _trB);
+                                               x0, xmax, k0, kmax);
 
                     buffer += size;
                 }
@@ -266,6 +250,28 @@
     void set_pretransposed_B_data(void *in_buffer) override {
         _B_transposed = reinterpret_cast<Toi *>(in_buffer);
     }
+
+    // Estimate cycles for given problem given provided parameters
+    static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params) {
+        // Note: Current hybrid kernels don't actually round up height (they
+        // have paths for each possible height).  Might need to make this
+        // configurable in future.
+        uint64_t total_macs = static_cast<uint64_t>(args._nbatches) * args._nmulti * args._Msize * roundup(args._Nsize, strategy::out_width()) * roundup(args._Ksize, strategy::k_unroll());
+
+        float mac_cycles = static_cast<float>(total_macs) / params.kernel_macs_cycle;
+
+        // TODO: A bit of a kludge here: current hybrid kernels incur extra
+        // overhead where the width is not a multiple of kernel width.  It's
+        // most noticable where the overall width is quite low, so add 15%
+        // penalty for such widths.
+        if ((args._Nsize < strategy::out_width()) || (args._Nsize > strategy::out_width() && args._Nsize < 2*strategy::out_width())) {
+            mac_cycles *= 1.15f;
+        }
+
+        uint64_t total_cycles = mac_cycles;
+
+        return total_cycles;
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
index 6897e64..915227f 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,10 +28,9 @@
 #include <algorithm>
 
 #include "arm_gemm.hpp"
+#include "ndrange.hpp"
 #include "utils.hpp"
 
-#include "arm_compute/core/NEON/kernels/arm_gemm/ndrange.hpp"
-
 #include "mergeresults.hpp"
 #include "transform.hpp"
 
@@ -57,8 +56,6 @@
     const unsigned int _nbatches;
     const unsigned int _nmulti;
 
-    const bool _trB;
-
     /* Blocking info */
     const unsigned int _k_block;
     const unsigned int _n_block;
@@ -143,7 +140,7 @@
     /* Constructor */
     GemmHybridQuantized(const GemmArgs &args, const Requantize32 &qp)
               : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
-                _nbatches(args._nbatches), _nmulti(args._nmulti), _trB(args._trB),
+                _nbatches(args._nbatches), _nmulti(args._nmulti),
                 _k_block(compute_k_block(args)), _n_block(compute_n_block(args)),
                 _Mround(roundup(args._Msize, strategy::out_height())),
                 _window_range(iceildiv(args._Msize, strategy::out_height()), _nbatches, iceildiv(_Nsize, _n_block), _nmulti),
@@ -151,7 +148,7 @@
 
     // Interface implementation - Compulsory functions
     ndrange_t get_window_size() const override {
-        return { _window_range.total_size(), 1u, 1u, 1u, 1u, 1u };
+        return { _window_range.total_size() };
     }
 
     // This kernel can always be dynamically scheduled.
@@ -159,7 +156,8 @@
         return true;
     }
 
-    void execute_1d(unsigned int start, unsigned int end, int threadid) {
+    // Execute
+    void execute(const ndcoord_t &work_range, const ndcoord_t &, int threadid) override {
 #ifdef CYCLE_PROFILING
         profiler prof;
 #endif
@@ -180,7 +178,7 @@
             unsigned int kmax   = std::min(k0 + _k_block, _Ksize);
             unsigned int kern_k = roundup(kmax-k0, strategy::k_unroll());
 
-            auto p = _window_range.iterator(start, end);
+            auto p = _window_range.iterator(work_range.get_position(0), work_range.get_position_end(0));
 
             if (p.done()) {
                 return;
@@ -228,23 +226,12 @@
 
                     requantize_block_32(_qp, (nmax - n0), (m_end - m_start), result_buffer, (nmax - n0),
                                         this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc,
-                                        local_row_sums, col_bias + (multi * _Nsize) + n0);
+                                        local_row_sums, col_bias + (multi * _Nsize) + n0, n0);
                 }
             } while (p.next_dim0());
         }
     }
 
-    // Execute
-    void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
-        UNUSED(thread_locator);
-
-        const auto start = work_range.get_position(0);
-        const auto size  = work_range.get_size(0);
-        const auto stop  = start + size;
-
-        execute_1d(start, stop, threadid);
-    }
-
     // Working space needed for intermediate result buffers.
     size_t get_working_size() const override {
         return (_nthreads * strategy::out_height() * _Nsize * sizeof(Tri));
@@ -290,7 +277,7 @@
                     const unsigned int size = roundup(xmax-x0, strategy::out_width()) * k_size;
 
                     strat.transforms.PrepareB( buffer, B + (multi * B_multi_stride), ldb,
-                                               x0, xmax, k0, kmax, _trB);
+                                               x0, xmax, k0, kmax);
 
                     buffer += size;
                 }
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
index 569d1f4..261e7d2 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,8 +22,9 @@
  * SOFTWARE.
  */
 
-#include <arm_gemm.hpp>
+#include "arm_gemm.hpp"
 
+#include <cstdint>
 #include <functional>
 
 namespace arm_gemm {
@@ -37,7 +38,7 @@
     const GemmMethod                                                               method;
     const char *                                                                   name;
     std::function<bool(const GemmArgs &, const OutputStage &)>                     is_supported;
-    std::function<bool(const GemmArgs &, const OutputStage &)>                     is_recommended;
+    std::function<uint64_t(const GemmArgs &, const OutputStage &)>                 cycle_estimate;
     std::function<GemmCommon<Top, Tret> *(const GemmArgs &, const OutputStage &)>  instantiate;
 
     bool do_is_supported(const GemmArgs &args, const OutputStage &os) const {
@@ -48,17 +49,27 @@
         }
     }
 
-    bool do_is_recommended(const GemmArgs &args, const OutputStage &os) const {
-        if (is_recommended != nullptr) {
-            return is_recommended(args, os);
+    uint64_t do_cycle_estimate(const GemmArgs &args, const OutputStage &os) const {
+        if (cycle_estimate != nullptr) {
+            return cycle_estimate(args, os);
         } else {
-            return true;
+            return 0;
         }
     }
 
+    GemmImplementation(const GemmImplementation &) = default;
+    GemmImplementation &operator= (const GemmImplementation &) = default;
+
     GemmCommon<Top, Tret> *do_instantiate(const GemmArgs &args, const OutputStage &os) const {
         return instantiate(args, os);
     }
+
+    GemmImplementation(GemmMethod m, const char *n,
+                       std::function<bool(const GemmArgs &, const OutputStage &)> is_supported, std::function<bool(const GemmArgs &, const OutputStage &)> is_recommended,
+                       std::function<GemmCommon<Top, Tret> *(const GemmArgs &, const OutputStage &)> instantiate) :
+                       method(m), name(n), is_supported(is_supported),
+                       cycle_estimate( [is_recommended](const GemmArgs &args, const OutputStage &os) { return (is_recommended == nullptr) ? 0 : (is_recommended(args, os) ? 0 : UINT64_MAX); } ),
+                       instantiate(instantiate) {   }
 };
 
 /* Slightly different version of above for straightforward GEMMs with no
@@ -69,7 +80,7 @@
     const GemmMethod                                          method;
     const char *                                              name;
     std::function<bool(const GemmArgs &)>                     is_supported;
-    std::function<bool(const GemmArgs &)>                     is_recommended;
+    std::function<uint64_t(const GemmArgs &)>                 cycle_estimate;
     std::function<GemmCommon<Top, Tret> *(const GemmArgs &)>  instantiate;
 
     bool do_is_supported(const GemmArgs &args, const Nothing &) const {
@@ -80,17 +91,42 @@
         }
     }
 
-    bool do_is_recommended(const GemmArgs &args, const Nothing &) const {
-        if (is_recommended != nullptr) {
-            return is_recommended(args);
+    uint64_t do_cycle_estimate(const GemmArgs &args, const Nothing &) const {
+        if (cycle_estimate != nullptr) {
+            return cycle_estimate(args);
         } else {
-            return true;
+            return 0;
         }
     }
 
     GemmCommon<Top, Tret> *do_instantiate(const GemmArgs &args, const Nothing &) const {
         return instantiate(args);
     }
+
+
+    static GemmImplementation with_estimate(GemmMethod m, const char *n,
+                       std::function<bool(const GemmArgs &)> is_supported, std::function<uint64_t(const GemmArgs &)> cycle_estimate,
+                       std::function<GemmCommon<Top, Tret> *(const GemmArgs &)> instantiate) {
+        GemmImplementation impl(m,n);
+
+        impl.is_supported=is_supported;
+        impl.cycle_estimate=cycle_estimate;
+        impl.instantiate=instantiate;
+
+        return impl;
+    }
+
+    GemmImplementation(GemmMethod m, const char * n) : method(m), name(n), is_supported(nullptr), cycle_estimate(nullptr), instantiate(nullptr) {}
+
+    GemmImplementation(GemmMethod m, const char *n,
+                       std::function<bool(const GemmArgs &)> is_supported, std::function<bool(const GemmArgs &)> is_recommended,
+                       std::function<GemmCommon<Top, Tret> *(const GemmArgs &)> instantiate) :
+                       method(m), name(n), is_supported(is_supported),
+                       cycle_estimate( [is_recommended](const GemmArgs &args) -> uint64_t { return (is_recommended == nullptr) ? 0 : (is_recommended(args) ? 0 : UINT64_MAX); } ),
+                       instantiate(instantiate) {   }
+
+    GemmImplementation(const GemmImplementation &) = default;
+    GemmImplementation &operator=(const GemmImplementation &) = default;
 };
 
 /* "Master" function implemented for each valid combination of types.
@@ -103,13 +139,11 @@
 /*
  * Select a GEMM implementation for the given arguments.
  *
- * The logic here returns the first method on the list which supports the
+ * The logic here returns the method on the list which supports the
  * requested problem parameters, matches the provided filters (method and/or
- * name string match) and recommends itself.
- *
- * If there is no such method, it will return the first method which
- * supports the requested parameters and passes the filters, regardless of
- * recommendation.
+ * name string match) and offers the lowest cycle estimate.  A cycle
+ * estimate of '0' is treated as a special value, causing the corresponding
+ * method to be selected immediately.
  *
  * If no method supports the requested parameters and passes the filters,
  * this function returns false and doesn't touch the provided pointer
@@ -121,6 +155,7 @@
     const GemmConfig *cfg = args._cfg;
 
     const GemmImplementation<Top, Tret, OutputStage> *saved_impl = nullptr;
+    uint64_t best_estimate = 0;
 
     for (const GemmImplementation<Top, Tret, OutputStage> *i = gemms; i->method != GemmMethod::DEFAULT; i++) {
         /* Skip if this implementation doesn't support these args. */
@@ -138,27 +173,24 @@
             continue;
         }
 
-        /* At this point, if we don't have a saved implementation, save this
-         * one.  This is so that we always return something if a filter
-         * matches, even if it doesn't recommend itself.
-         */
-        if (saved_impl == nullptr) {
-            saved_impl=i;
+        /* Test the cycle estimate */
+        uint64_t estimate = i->do_cycle_estimate(args, os);
+
+        /* Short circuit - if the estimate is zero, return this one immediately. */
+        if (estimate==0) {
+            impl=i;
+            return true;
         }
 
-        /* Check that this method recommends itself. */
-        if (!i->do_is_recommended(args, os)) {
-            continue;
+        /* Otherwise, remember this is our best so far if we don't yet have
+         * a valid candidate, or we beat the estimate.  */
+        if ((saved_impl == nullptr) || (estimate < best_estimate)) {
+            saved_impl = i;
+            best_estimate = estimate;
         }
-
-        impl=i;
-
-        return true;
     }
 
-    /* We didn't find an option matching the filters that recommended
-     * itself.  But if we found something earlier that matched the filters
-     * but wasn't recommended, return it here.  */
+    /* Return whichever method gave the best estimate. */
     if (saved_impl != nullptr) {
         impl = saved_impl;
         return true;
@@ -183,7 +215,7 @@
             continue;
         }
 
-        res.push_back(KernelDescription(i->method, i->name, i==default_impl));
+        res.push_back(KernelDescription(i->method, i->name, i==default_impl, i->do_cycle_estimate(args, os)));
     }
 
     return res;
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
index e3b4416..da68233 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
index f7d8f65..bddcc8d 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,7 +28,7 @@
 #include "gemm_hybrid.hpp"
 #include "gemm_implementation.hpp"
 #include "gemm_interleaved.hpp"
-#include "gemm_native.hpp"
+#include "gemm_interleaved_pretransposed_2d.hpp"
 
 #include "kernels/a64_gemm_s16_12x8.hpp"
 #include "kernels/a64_gemm_s8_12x8.hpp"
@@ -40,14 +40,13 @@
 #include "kernels/sve_hybrid_s8s32_dot_4VLx4.hpp"
 #include "kernels/sve_interleaved_s8s32_dot_3VLx8.hpp"
 #include "kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp"
-#include "kernels/sve_native_s8s32_dot_4VLx4.hpp"
 #include "kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp"
 
 namespace arm_gemm {
 
 static const GemmImplementation<int8_t, int32_t> gemm_s8_methods[] = {
 #ifdef __ARM_FEATURE_SVE
-#ifdef V8P6
+#ifdef MMLA_INT8
 {
     GemmMethod::GEMM_INTERLEAVED,
     "interleaved_s8s32_mmla_3VLx8",
@@ -59,25 +58,18 @@
 {
     GemmMethod::GEMM_HYBRID,
     "smallK_hybrid_s8s32_dot_1VLx8",
-    [](const GemmArgs &args) { return args._Ksize<=64 && !args._trA && args._pretransposed_hint; },
+    [](const GemmArgs &args) { return args._Ksize<=64; },
     nullptr,
     [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_s8s32_dot_1VLx8, int8_t, int32_t>(args); }
 },
 {
     GemmMethod::GEMM_HYBRID,
     "hybrid_s8s32_dot_4VLx4",
-    [](const GemmArgs &args) { return args._Ksize>=16 && !args._trA && args._pretransposed_hint; },
+    [](const GemmArgs &args) { return args._Ksize>=16; },
     [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
     [](const GemmArgs &args) { return new GemmHybrid<hybrid_s8s32_dot_4VLx4, int8_t, int32_t>(args); }
 },
 {
-    GemmMethod::GEMM_NATIVE,
-    "native_s8s32_dot_4VLx4",
-    [](const GemmArgs &args) { return (args._Ksize>=16 && !args._trA && !args._trB); },
-    [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)); },
-    [](const GemmArgs &args) { return new GemmNative<native_s8s32_dot_4VLx4, int8_t, int32_t>(args); }
-},
-{
     GemmMethod::GEMM_INTERLEAVED,
     "interleaved_s8s32_dot_3VLx8",
     [](const GemmArgs &args) { return (args._Ksize>4); },
@@ -85,7 +77,7 @@
     [](const GemmArgs &args) { return new GemmInterleaved<interleaved_s8s32_dot_3VLx8, int8_t, int32_t>(args); }
 },
 #endif
-#ifdef V8P6
+#ifdef MMLA_INT8
 {
     GemmMethod::GEMM_INTERLEAVED,
     "interleaved_s8s32_mmla_12x8",
@@ -97,34 +89,55 @@
 {
     GemmMethod::GEMM_HYBRID,
     "smallK_hybrid_s8s32_dot_4x8",
-    [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32) && !args._trA && args._pretransposed_hint; },
+    [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32); },
     nullptr,
     [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_s8s32_dot_4x8, int8_t, int32_t>(args); }
 },
 {
     GemmMethod::GEMM_HYBRID,
     "smallK_hybrid_s8s32_dot_4x6",
-    [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64) && !args._trA && args._pretransposed_hint; },
+    [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64); },
     nullptr,
     [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_s8s32_dot_4x6, int8_t, int32_t>(args); }
 },
 {
     GemmMethod::GEMM_HYBRID,
     "hybrid_s8s32_dot_16x4",
-    [](const GemmArgs &args) { return args._ci->has_dotprod() && args._Ksize>=16 && !args._trA && !args._trB && args._pretransposed_hint; },
+    [](const GemmArgs &args) { return args._ci->has_dotprod() && args._Ksize>=16; },
     [](const GemmArgs &args) { return args._Nsize<=256 && args._Ksize>128; },
     [](const GemmArgs &args) { return new GemmHybrid<hybrid_s8s32_dot_16x4, int8_t, int32_t>(args); }
 },
 {
+    GemmMethod::GEMM_INTERLEAVED_2D,
+    "gemm_s8_12x8_2d",
+    [](const GemmArgs &args) { return args._ci->has_dotprod(); },
+    [](const GemmArgs &args) { return (args._maxthreads >= 8) && (args._Msize >= 8) && (args._Nsize >= 8); },
+    [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<gemm_s8_12x8, int8_t, int32_t>(args); }
+},
+{
     GemmMethod::GEMM_INTERLEAVED,
-    "gemm_s8_12x8",
+    "gemm_s8_12x8_1d",
     [](const GemmArgs &args) { return args._ci->has_dotprod(); },
     nullptr,
     [](const GemmArgs &args) { return new GemmInterleaved<gemm_s8_12x8, int8_t, int32_t>(args); }
 },
 {
     GemmMethod::GEMM_INTERLEAVED,
-    "gemm_s8_4x4",
+    "gemm_s16_12x8",
+    nullptr,
+    [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53; },
+    [](const GemmArgs &args) { return new GemmInterleaved<gemm_s16_12x8, int8_t, int32_t>(args); },
+},
+{
+    GemmMethod::GEMM_INTERLEAVED_2D,
+    "gemm_s8_4x4_2d",
+    nullptr,
+    [](const GemmArgs &args) { return (args._maxthreads >= 8) && (args._Msize >= 8) && (args._Nsize >= 8); },
+    [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<gemm_s8_4x4, int8_t, int32_t>(args); }
+},
+{
+    GemmMethod::GEMM_INTERLEAVED,
+    "gemm_s8_4x4_1d",
     nullptr,
     nullptr,
     [](const GemmArgs &args) { return new GemmInterleaved<gemm_s8_4x4, int8_t, int32_t>(args); }
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index 4897bed..c4dceef 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,16 +23,14 @@
  */
 #pragma once
 
-#include <stdio.h>
-#include <assert.h>
-
 #include <algorithm>
+#include <cassert>
 
 #include "arm_gemm.hpp"
 #include "utils.hpp"
 
-#include "buffer_manager.hpp"
 #include "mergeresults.hpp"
+#include "performance_parameters.hpp"
 #include "transform.hpp"
 
 #ifdef CYCLE_PROFILING
@@ -65,14 +63,10 @@
     const unsigned int _nbatches;
     const unsigned int _nmulti;
 
-    const bool _trA;
-    const bool _trB;
-
     const Activation _act;
 
     const int _maxthreads;
     int _nthreads;
-    const bool _pretransposed;
 
     /* Blocking info */
     unsigned int _k_block=0;
@@ -81,7 +75,6 @@
 
     /* Working space, pretransposed buffer, buffer manager */
     const Toi *_B_transposed=nullptr;
-    BufferManager *_bm=nullptr;
     void *_working_space=nullptr;
 
     /* We will need to walk through the blocks of B in a few contexts, so
@@ -150,27 +143,106 @@
         return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches);
     }
 
-    // B working size: 0, 1 or 3 of these needed depending on pretransposed and threading settings.
-    size_t get_b_working_size() const {
-        return ROUND_UP(sizeof(Toi) * _x_block * _k_block);
-    }
-
     // C working size: One needed per thread.
     size_t get_c_working_size() const {
         return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
     }
 
-    // Internal execute function.
-    // This supports both the "pretransposed" and "standard" interfaces via the template parameter.
-    template<bool pretransposed>
-    void execute_internal(unsigned int start, unsigned int end, int threadid) {
+    static unsigned int get_k_block_size(const GemmArgs &args) {
+        if (args._cfg && args._cfg->inner_block_size) {
+            return args._cfg->inner_block_size;
+        }
+
+        const unsigned int L1_size = args._ci->get_L1_cache_size();
+        unsigned int k_block;
+
+        // k_block: Find out how much of the larger array can be loaded into half the cache.
+        // This should account for associative caches.
+        k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
+
+        // Needs to be (at least a single) multiple of the K unroll level.
+        k_block /= strategy::k_unroll();
+        k_block = std::max(k_block, 1U) * strategy::k_unroll();
+
+        // Now tune to presented problem size; this is how many blocks we need.
+        unsigned int num_k_blocks = iceildiv(args._Ksize, k_block);
+
+        // So divide the space equally into that many blocks.
+        k_block = iceildiv(args._Ksize, num_k_blocks);
+
+        // And round UP to the K unroll level required.
+        k_block = roundup(k_block, strategy::k_unroll());
+
+        return k_block;
+    }
+
+public:
+    GemmInterleaved(GemmInterleaved &) = delete;
+    GemmInterleaved & operator= (GemmInterleaved &) = delete;
+
+    /* Constructor */
+    GemmInterleaved(const GemmArgs &args)
+                    : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
+                      _nbatches(args._nbatches), _nmulti(args._nmulti),
+                      _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
+                      _k_block(get_k_block_size(args)) {
+        const unsigned int L2_size = _ci->get_L2_cache_size();
+
+        assert(_maxthreads > 0);
+
+        // Work out blocking parameters, or override from provided GemmConfig
+        // TODO: Move outer block into a static function too.
+        if (args._cfg && args._cfg->outer_block_size) {
+            _x_block = args._cfg->outer_block_size;
+        } else {
+            // x_block: Work out how many rows (of length k_block) will fit in the L2
+            // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
+            _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
+                      (sizeof(Toi) * _k_block);
+
+            // Needs to be (at least a single) multiple of the kernel output width.
+            _x_block /= strategy::out_width();
+            _x_block = std::max(_x_block, 1U) * strategy::out_width();
+
+            // And tune to the presented problem size.
+            unsigned int num_x_blocks = iceildiv(_Nsize, _x_block);
+            _x_block = iceildiv(_Nsize, num_x_blocks);
+
+            _x_block = iceildiv(_x_block, strategy::out_width());
+            _x_block *= strategy::out_width();
+        }
+
+        // Work out the rounded size of M - needed for some buffers.
+        _Mround = iceildiv(_Msize, strategy::out_height());
+        _Mround *= strategy::out_height();
+    }
+
+    // Interface implementation - Compulsory functions
+
+    // Window size: Only the last thread should do a ragged block, so dole
+    // out work in units of out_height.  Factor batches into the window, but
+    // not multi for now (as this would cause problems with the buffer
+    // manager).
+    ndrange_t get_window_size() const override {
+        // _Mround is a multiple of out_height by definition.
+        return { (_Mround / strategy::out_height()) * _nbatches };
+    }
+
+    // set_nthreads: pass on to buffer manager to avoid it waiting for non-existant threads.
+    void set_nthreads(int nthreads) override {
+        _nthreads = std::min(nthreads, _maxthreads);
+    }
+
+    // Execute
+    void execute(const ndcoord_t &work_range, const ndcoord_t &, int threadid) override {
+        const auto start = work_range.get_position(0);
+        const auto end   = work_range.get_position_end(0);
 #ifdef CYCLE_PROFILING
         profiler prof;
 #endif
         strategy strat(_ci);
 
         blockwalker current(*this);
-        blockwalker next=current;
 
         /* Translate 'start' and 'end' into a position within the batches and rows. */
         const unsigned int window_per_batch = _Mround / strategy::out_height();
@@ -182,12 +254,7 @@
         unsigned int m_max = (end - (batch_end * window_per_batch)) * strategy::out_height();
 
         /* Make sure we've been set up correctly. */
-        if (pretransposed) {
-            assert(_B_transposed);
-        } else {
-            assert(_bm);
-        }
-
+        assert(_B_transposed);
         assert(_working_space);
         int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);
 
@@ -198,12 +265,8 @@
         Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()));
         Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));
 
-        // Shared buffers - these come either from BufferManager or _B_transposed.
         const Toi *b_panel;
-
-        if (pretransposed) {
-            b_panel = _B_transposed;
-        }
+        b_panel = _B_transposed;
 
         //printf("Starting GEMM loop, x_block=%d, k_block=%d\n", _x_block, _k_block);
 
@@ -224,7 +287,7 @@
 
                     strat.transforms.PrepareA(a_panel + ((batch * _Mround + first_m) * _k_block),
                                               this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
-                                              this->_lda, first_m, last_m, current.k0(), current.kmax(), _trA);
+                                              this->_lda, first_m, last_m, current.k0(), current.kmax());
                 }
 
                 // Figure out how many "K" the kernel will actually process.
@@ -234,41 +297,6 @@
 
             int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width());
 
-            if (!pretransposed) {
-                /* Look ahead to the next block and populate it if necessary.
-                 * This avoids the populate operation becoming a bottleneck, and
-                 * helps keep the threads synchronized (the first thread to get
-                 * here will populate while the rest will advance).
-                 *
-                 * If we are running single threaded, bm->try_populate() will do
-                 * nothing.
-                 */
-                if (next.advance()) {
-                    _bm->try_populate(next.index(), [&](void *buffer) {
-#ifdef CYCLE_PROFILING
-                        auto p=prof.ScopedProfiler(PROFILE_PREPB, (next.xmax()-next.x0()) * (next.kmax()-next.k0()) * sizeof(Toi));
-#endif
-
-                        Toi *b_panel = reinterpret_cast<Toi *>(buffer);
-
-                        strat.transforms.PrepareB(b_panel, this->_Bptr + (next.multi() * this->_B_multi_stride), this->_ldb,
-                                                  next.x0(), next.xmax(), next.k0(), next.kmax(), _trB);
-                    });
-                }
-
-                /* Get the buffer for this iteration from the BufferManager. */
-                b_panel = reinterpret_cast<Toi *>(_bm->get(current.index(), [&](void *bpv) {
-#ifdef CYCLE_PROFILING
-                    auto p=prof.ScopedProfiler(PROFILE_PREPB, (current.xmax()-current.x0()) * (current.kmax()-current.k0()) * sizeof(Toi));
-#endif
-
-                    Toi *b_panel = reinterpret_cast<Toi *>(bpv);
-
-                    strat.transforms.PrepareB(b_panel, this->_Bptr + (current.multi() * this->_B_multi_stride), this->_ldb,
-                                              current.x0(), current.xmax(), current.k0(), current.kmax(), _trB);
-                }));
-            }
-
             /* Do the actual work. */
             for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
                 unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
@@ -308,126 +336,15 @@
                 }
             }
 
-            if (pretransposed) {
-                b_panel += (bblocks * strat.out_width() * kern_k);
-            } else {
-                _bm->release(current.index());
-            }
+            b_panel += (bblocks * strat.out_width() * kern_k);
         }
     }
 
-public:
-    GemmInterleaved(GemmInterleaved &) = delete;
-    GemmInterleaved & operator= (GemmInterleaved &) = delete;
-
-    /* Constructor */
-    GemmInterleaved(const GemmArgs &args)
-                    : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
-                      _nbatches(args._nbatches), _nmulti(args._nmulti), _trA(args._trA), _trB(args._trB),
-                      _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
-                      _pretransposed(args._pretransposed_hint) {
-        const unsigned int L1_size = _ci->get_L1_cache_size();
-        const unsigned int L2_size = _ci->get_L2_cache_size();
-
-        assert(_maxthreads > 0);
-
-        // Work out blocking parameters, or override from provided GemmConfig
-        if (args._cfg && args._cfg->inner_block_size) {
-            _k_block = args._cfg->inner_block_size;
-        } else {
-            // k_block: Find out how much of the larger array can be loaded into half the cache.
-            // This should account for associative caches.
-            _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
-
-            // Needs to be (at least a single) multiple of the K unroll level.
-            _k_block /= strategy::k_unroll();
-            _k_block = std::max(_k_block, 1U) * strategy::k_unroll();
-
-            // Now tune to presented problem size; this is how many blocks we need.
-            unsigned int num_k_blocks = iceildiv(_Ksize, _k_block);
-
-            // So divide the space equally into that many blocks.
-            _k_block = iceildiv(_Ksize, num_k_blocks);
-
-            // And round UP to the K unroll level required.
-            _k_block = iceildiv(_k_block, strategy::k_unroll());
-            _k_block *= strategy::k_unroll();
-        }
-
-        if (args._cfg && args._cfg->outer_block_size) {
-            _x_block = args._cfg->outer_block_size;
-        } else {
-            // x_block: Work out how many rows (of length k_block) will fit in the L2
-            // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
-            _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
-                      (sizeof(Toi) * _k_block);
-
-            // Needs to be (at least a single) multiple of the kernel output width.
-            _x_block /= strategy::out_width();
-            _x_block = std::max(_x_block, 1U) * strategy::out_width();
-
-            // And tune to the presented problem size.
-            unsigned int num_x_blocks = iceildiv(_Nsize, _x_block);
-            _x_block = iceildiv(_Nsize, num_x_blocks);
-
-            _x_block = iceildiv(_x_block, strategy::out_width());
-            _x_block *= strategy::out_width();
-        }
-
-        // Work out the rounded size of M - needed for some buffers.
-        _Mround = iceildiv(_Msize, strategy::out_height());
-        _Mround *= strategy::out_height();
-    }
-
-    // Interface implementation - Compulsory functions
-
-    // Window size: Only the last thread should do a ragged block, so dole
-    // out work in units of out_height.  Factor batches into the window, but
-    // not multi for now (as this would cause problems with the buffer
-    // manager).
-    ndrange_t get_window_size() const override {
-        auto m_win_size = (_Mround / strategy::out_height()) * _nbatches;
-        return { m_win_size, 1u, 1u, 1u, 1u, 1u };
-    }
-
-    // set_nthreads: pass on to buffer manager to avoid it waiting for non-existant threads.
-    void set_nthreads(int nthreads) override {
-        _nthreads = std::min(nthreads, _maxthreads);
-        if (_bm) {
-            _bm->set_nthreads(_nthreads);
-        }
-    }
-
-    // Execute
-    void execute_1d(unsigned int start, unsigned int end, int threadid) {
-        if (_pretransposed) {
-            execute_internal<true>(start, end, threadid);
-        } else {
-            execute_internal<false>(start, end, threadid);
-        }
-    }
-
-    //Execute
-    void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
-        UNUSED(thread_locator);
-
-        const auto start = work_range.get_position(0);
-        const auto stop  = work_range.get_position_end(0);
-
-        execute_1d(start, stop, threadid);
-    }
-
     // Interface implementation - working space
     size_t get_working_size() const override {
         // In all cases, we need one A buffer plus a C buffer per thread.
         size_t size = get_a_working_size() + (get_c_working_size() * _maxthreads);
 
-        // For pretransposed case, there is no working space needed for B.
-        // Otherwise, we need a BufferManager.
-        if (!_pretransposed) {
-            size += BufferManager::get_storage_requirement(_maxthreads, get_b_working_size());
-        }
-
         size += 64; // Add on a cache line extra for alignment.
 
         return size;
@@ -446,29 +363,17 @@
 
         working_space_bytes += diff;
 
-        if (_pretransposed) {
-            // Pretransposed case: just set internal pointer to parameter value.
-            _working_space = reinterpret_cast<void *>(working_space_bytes);
-        } else {
-            // Otherwise, use the first part of the working space for the buffer manager.
-            // It's legal to call this again so don't leak a buffer manager if it already existed.
-            delete _bm;
-
-            _bm = new BufferManager(_nthreads, get_b_working_size(), reinterpret_cast<void *>(working_space_bytes));
-
-            working_space_bytes += BufferManager::get_storage_requirement(_maxthreads, get_b_working_size());
-
-            _working_space = reinterpret_cast<void *>(working_space_bytes);
-        }
+        // Pretransposed case: just set internal pointer to parameter value.
+        _working_space = reinterpret_cast<void *>(working_space_bytes);
     }
 
     // Interface implementation - pretransposed
     bool B_is_pretransposed() const override {
-        return _pretransposed;
+        return true;
     }
 
     bool B_pretranspose_required() const override {
-        return _pretransposed && (_B_transposed==nullptr);
+        return (_B_transposed==nullptr);
     }
 
     // TODO: this could almost certainly be considerably simpler.
@@ -513,7 +418,7 @@
             k_size *= strategy::k_unroll();
 
             strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
-                                      current.x0(), current.xmax(), current.k0(), current.kmax(), _trB);
+                                      current.x0(), current.xmax(), current.k0(), current.kmax());
 
             buffer += (x_size * k_size);
         } while (current.advance());
@@ -523,8 +428,29 @@
         _B_transposed = reinterpret_cast<Toi *>(in_buffer);
     }
 
-    ~GemmInterleaved() override {
-        delete _bm;
+    // Estimate cycles for the given problem using the provided performance parameters.
+    static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params) {
+        unsigned int k_blocks = iceildiv(args._Ksize, get_k_block_size(args));
+
+        uint64_t total_macs    = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * roundup(args._Ksize, strategy::k_unroll());
+        uint64_t prepare_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Ksize, strategy::k_unroll()) * sizeof(Toi);
+        uint64_t merge_bytes   = static_cast<uint64_t>(args._nbatches) * args._nmulti * k_blocks * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * sizeof(Tr);
+
+        float mac_cycles     = static_cast<float>(total_macs) / params.kernel_macs_cycle;
+        float prepare_cycles = static_cast<float>(prepare_bytes) / params.prepare_bytes_cycle;
+        float merge_cycles   = static_cast<float>(merge_bytes) / params.merge_bytes_cycle;
+
+        float total_cycles = mac_cycles + prepare_cycles + merge_cycles;
+
+        // We can't thread over multis or width, which makes this a poor
+        // choice in many threaded cases.  Penalize that here.
+        float parallelism_available = static_cast<float>(iceildiv(args._Msize, strategy::out_height()) * args._nbatches) * 0.9f;
+
+        if (parallelism_available < args._maxthreads) {
+            total_cycles *= (static_cast<float>(args._maxthreads) / parallelism_available);
+        }
+
+        return static_cast<uint64_t>(total_cycles);
     }
 };
 
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp
deleted file mode 100644
index 53f8e6c..0000000
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp
+++ /dev/null
@@ -1,449 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include "arm_gemm.hpp"
-#include "utils.hpp"
-
-#include "mergeresults.hpp"
-#include "transform.hpp"
-
-#ifdef CYCLE_PROFILING
-#include "profiler.hpp"
-#endif
-
-#include <algorithm>
-#include <cassert>
-
-// Some macros used to decide how much working space to allocate.
-// Round allocations up to the next cache line.
-#define ALLOC_ROUND    64
-#define ROUND_UP(x)    ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
-
-// Implementation of the GemmCommon abstract class.
-//
-// This implementation interleaves the source matrices in blocks - good for
-// larger matrices.
-namespace arm_gemm {
-
-template<typename strategy, typename To, typename Tr>
-class GemmInterleaved2d : public GemmCommon<To, Tr> {
-    typedef typename strategy::operand_type Toi;
-    typedef typename strategy::result_type Tri;
-
-    /* const properties set by constructor */
-    const CPUInfo * const _ci;
-
-    const unsigned int _Msize;
-    const unsigned int _Nsize;
-    const unsigned int _Ksize;
-
-    const unsigned int _nbatches;
-    const unsigned int _nmulti;
-
-    const bool _trA;
-    const bool _trB;
-
-    const Activation _act;
-
-    const int _maxthreads;
-    int _nthreads;
-
-    /* Blocking info */
-    unsigned int _k_block=0;
-    unsigned int _x_block=0;
-
-    unsigned int _Mround_div=0;
-    unsigned int _Mround=0;
-    unsigned int _Nround_div=0;
-    unsigned int _Nround=0;
-
-    /* Working space, pretransposed buffer */
-    void *_working_space=nullptr;
-
-    /* We will need to walk through the blocks of B in a few contexts, so
-     * factor that out.  */
-    class blockwalker {
-    private:
-        /* Size loops, etc. based on our parent's configuration */
-        const GemmInterleaved2d<strategy, To, Tr> &_parent;
-
-        /* K, X and multi parameters for current iteration. */
-        unsigned int _k0=0, _x0=0, _xmin=0, _xmax=0, _multi=0;
-
-        unsigned int _index=0;
-        bool _done=false;
-        bool _newkblock=true;
-        bool _newmulti=true;
-
-    public:
-        blockwalker(const GemmInterleaved2d<strategy, To, Tr> &parent)
-        : _parent(parent)
-        , _xmax { parent._Nsize }
-        { }
-
-        blockwalker(const GemmInterleaved2d<strategy, To, Tr> &parent, unsigned int x0, unsigned int xmax)
-        : _parent(parent)
-        , _x0   { x0   }
-        , _xmin { x0   }
-        , _xmax { xmax }
-        {
-            assert(_x0 <= _xmax);
-        }
-
-        unsigned int xmax() {
-            return std::min(_x0 + _parent._x_block, _xmax);
-        }
-
-        unsigned int kmax() {
-            return std::min(_k0 + _parent._k_block, _parent._Ksize);
-        }
-
-        /* Advance to the next block, return false at the end. */
-        bool advance(void) {
-            if (_done) {
-                return false;
-            }
-
-            _newkblock=false;
-            _x0 += _parent._x_block;
-            if (_x0 >= _xmax) {
-                _x0=_xmin;
-                _k0 += _parent._k_block;
-                if (_k0 >= _parent._Ksize) {
-                    _k0=0;
-                    _multi++;
-                    if (_multi >= _parent._nmulti) {
-                        _done=true;
-                        return false;
-                    }
-                    _newmulti=true;
-                }
-                _newkblock=true;
-            }
-            _index++;
-
-            return true;
-        }
-
-        unsigned int k0(void) { return _k0; }
-        unsigned int x0(void) { return _x0; }
-        unsigned int multi(void) { return _multi; }
-        unsigned int index(void) { return _index; }
-        bool done(void) { return _done; }
-        bool newkblock(void) { return _newkblock; }
-    };
-
-    // A working size: One of these needed, regardless of thread count.  Divided according to window.
-    size_t get_a_working_size() const {
-        return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches) * 2;
-    }
-
-    // B working size: 0, 1 or 3 of these needed depending on pretransposed and threading settings.
-    size_t get_b_working_size() const {
-        return ROUND_UP(sizeof(Toi) * _x_block * _k_block);
-    }
-
-    // C working size: One needed per thread.
-    size_t get_c_working_size() const {
-        return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
-    }
-
-    void execute_transpose(unsigned int m_start, unsigned int m_end, unsigned int n_start, unsigned int n_end, int threadid, int mthreadid, int nthreadid) {
-        UNUSED(mthreadid);
-
-        strategy strat(_ci);
-
-        /* Translate 'start' and 'end' into a position within the batches and rows. */
-        const unsigned int window_per_batch = _Mround / strategy::out_height();
-        unsigned int batch_0   = m_start / window_per_batch;
-        unsigned int batch_end = m_end   / window_per_batch;
-
-        /* Compute the M values to operate on */
-        unsigned int m_0   = (m_start - (batch_0 * window_per_batch)) * strategy::out_height();
-        unsigned int m_max = (m_end - (batch_end * window_per_batch)) * strategy::out_height();
-
-        unsigned int n_0   = std::min(this->_Nsize, strategy::out_width() * n_start);
-        unsigned int n_max = std::min(this->_Nsize, strategy::out_width() * n_end);
-
-        blockwalker current(*this, n_0, n_max);
-
-        /* get workspace as int8_t */
-        assert(_working_space);
-        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);
-
-        auto c_panel_start = working_space_bytes;
-        auto a_panel_start = c_panel_start + get_c_working_size() * _maxthreads;
-        auto b_panel_start = a_panel_start + get_a_working_size() * _maxthreads;
-
-        auto c_panel = reinterpret_cast<Tri *>(c_panel_start + get_c_working_size() * threadid);
-        auto a_panel = reinterpret_cast<Toi *>(a_panel_start + get_a_working_size() * nthreadid);
-        auto b_panel = reinterpret_cast<Toi *>(b_panel_start + get_b_working_size() * threadid);
-
-
-        // newkblock() is always true on the first iteration, so this will be set properly on the first loop.
-
-        int kern_k = 0;
-        for (;!current.done();current.advance()) {
-              const int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width());
-            /*
-             * The entirity of A^kblock is transpose upfront and computed against individual
-             * blocks of B (xblock)
-             *
-             * Therefore, we only need to retranspose when k_block progresses
-             */
-            if (current.newkblock()) {
-                for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
-                    unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
-                    unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;
-
-                    if (first_m >= last_m)
-                        continue;
-
-                    auto a_thread_panel_in  = this->_Aptr
-                                            + (batch * this->_A_batch_stride)
-                                            + (current.multi() * this->_A_multi_stride);
-
-                    auto a_thread_panel_out = a_panel + ((batch * _Mround + first_m) * _k_block);
-
-                    strat.transforms.PrepareA(
-                        a_thread_panel_out,
-                        a_thread_panel_in,
-                        this->_lda,
-                        first_m,
-                        last_m,
-                        current.k0(),
-                        current.kmax(),
-                        _trA);
-                }
-
-                kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll());
-                kern_k *= strat.k_unroll();
-            }
-
-            auto *b_panel_in = this->_Bptr + (current.multi() * this->_B_multi_stride);
-
-            strat.transforms.PrepareB(
-                b_panel,    //dst
-                b_panel_in, //src
-                this->_ldb,
-                current.x0(),   //idx from
-                current.xmax(), //idx to
-                current.k0(),
-                current.kmax(),
-                _trB);
-
-            //Iterate over the batches
-            for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
-                unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
-                unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;
-
-                if (first_m >= last_m)
-                    continue;
-
-                const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * _k_block;
-
-
-                //Iterate over the inerleaved rows of the packed A matrix
-                for (unsigned int y=first_m; y<last_m; y+=strategy::out_height()) {
-                    unsigned int ymax = std::min(_Msize, y + strategy::out_height());
-
-                    strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
-                    a_ptr += (strategy::out_height() * kern_k);
-
-                    const bool first_pass = current.k0()==0;
-                    const bool last_pass  = current.kmax()==_Ksize;
-
-                    auto c_panel_out = this->_Cptr
-                                     + this->_C_batch_stride * batch
-                                     + this->_C_multi_stride * current.multi();
-
-                    auto bias        = (first_pass && this->_bias)
-                                     ? this->_bias + (current.multi() * this->_bias_multi_stride)
-                                     : nullptr;
-
-                    auto act        = last_pass ? _act : Activation();
-
-                    strat.transforms.Merge(
-                        c_panel_out,
-                        c_panel,
-                        this->_ldc,
-                        y,
-                        ymax,
-                        current.x0(),
-                        current.xmax(),
-                        bias,
-                        act,
-                        !first_pass);  //Append
-                }
-            }
-        }
-    }
-public:
-    GemmInterleaved2d(GemmInterleaved2d &) = delete;
-    GemmInterleaved2d & operator= (GemmInterleaved2d &) = delete;
-
-    /* Constructor */
-    /* Constructor */
-    GemmInterleaved2d(const GemmArgs &args)
-    :    _ci(args._ci)
-    ,    _Msize(args._Msize)
-    ,    _Nsize(args._Nsize)
-    ,    _Ksize(args._Ksize)
-    ,    _nbatches(args._nbatches)
-    ,    _nmulti(args._nmulti)
-    ,    _trA(args._trA)
-    ,    _trB(args._trB)
-    ,    _act(args._act)
-    ,    _maxthreads(args._maxthreads)
-    ,    _nthreads(args._maxthreads) 
-
-    // Work out the rounded size of M - needed for some buffers.
-    ,    _Mround_div ( iceildiv(_Msize, strategy::out_height()) )
-    ,    _Mround     ( _Mround_div * strategy::out_height()     )
-
-    ,    _Nround_div ( iceildiv(_Nsize, strategy::out_width()) )
-    ,    _Nround     ( _Nround_div * strategy::out_width()     )
-    {
-        const unsigned int L1_size = _ci->get_L1_cache_size();
-        const unsigned int L2_size = _ci->get_L2_cache_size();
-
-        assert(_maxthreads > 0);
-
-        // Work out blocking parameters, or override from provided GemmConfig
-        if (args._cfg && args._cfg->inner_block_size) {
-            _k_block = args._cfg->inner_block_size;
-        } else {
-            // k_block: Find out how much of the larger array can be loaded into half the cache.
-            // This should account for associative caches.
-            _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
-
-            // Needs to be (at least a single) multiple of the K unroll level.
-            _k_block /= strategy::k_unroll();
-            _k_block = std::max(_k_block, 1U) * strategy::k_unroll();
-
-            // Now tune to presented problem size; this is how many blocks we need.
-            unsigned int num_k_blocks = iceildiv(_Ksize, _k_block);
-
-            // So divide the space equally into that many blocks.
-            _k_block = iceildiv(_Ksize, num_k_blocks);
-
-            // And round UP to the K unroll level required.
-            _k_block = iceildiv(_k_block, strategy::k_unroll());
-            _k_block *= strategy::k_unroll();
-        }
-
-        if (args._cfg && args._cfg->outer_block_size) {
-            _x_block = args._cfg->outer_block_size;
-        } else {
-            // x_block: Work out how many rows (of length k_block) will fit in the L2
-            // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
-            _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
-                      (sizeof(Toi) * _k_block);
-
-            // Needs to be (at least a single) multiple of the kernel output width.
-            _x_block /= strategy::out_width();
-            _x_block = std::max(_x_block, 1U) * strategy::out_width();
-
-            // And tune to the presented problem size.
-            unsigned int num_x_blocks = iceildiv(_Nsize, _x_block);
-            _x_block = iceildiv(_Nsize, num_x_blocks);
-
-            _x_block = iceildiv(_x_block, strategy::out_width());
-            _x_block *= strategy::out_width();
-        }
-
-        // Work out the rounded size of M - needed for some buffers.
-    }
-
-    // Interface implementation - Compulsory functions
-    ndrange_t get_window_size() const override {
-        unsigned m = (_Mround / strategy::out_height()) * _nbatches;
-        unsigned n = _Nround_div;
-
-        return { m, n, 1u, 1u, 1u, 1u };
-    }
-
-    // set_nthreads: pass on to buffer manager to avoid it waiting for non-existant threads.
-    void set_nthreads(int nthreads) override {
-        _nthreads = std::min(nthreads, _maxthreads);
-    }
-
-    void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
-        /*
-         * This particular GEMM implementation can only be broken up over the M & N
-         * dimensions, we inform the frame work of this limitation via the get_window_size function
-         */
-        assert(ndrange_popcount(work_range) <= 2);
-
-        const auto m_start = work_range.get_position(0);
-        const auto n_start = work_range.get_position(1);
-        const auto m_size  = work_range.get_size(0);
-        const auto n_size  = work_range.get_size(1);
-        const auto m_end   = m_start + m_size;
-        const auto n_end   = n_start + n_size;
-
-        const auto m_threadid = thread_locator.get_position(0);
-        const auto n_threadid = thread_locator.get_position(1);
-
-        execute_transpose(m_start, m_end, n_start, n_end, threadid, m_threadid, n_threadid);
-    }
-
-    std::size_t get_working_size()const override {
-        /*
-         * Because we do not know how schedular will break up
-         * the task, we need to ensure that alloc enough
-         * space to be able to handle the case where every thread
-         * is parallelised across B AND also every thrread is parallelised across A
-         *
-         * If we parallelise across A, then we only need one buffer of A and 64 buffers of B
-         * If we parallelise across B, then we only need 64 buffer of B and
-          */
-        return get_c_working_size() * _maxthreads
-             + get_a_working_size() * _maxthreads
-             + get_b_working_size() * _maxthreads
-             + 64; //to account for cacheline alignment
-    }
-
-
-    void set_working_space(void *working_space) override {
-        // Make sure everything ends up cache line aligned
-        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
-        intptr_t working_space_int  = reinterpret_cast<intptr_t>(working_space);
-
-        size_t diff=0;
-
-        if (working_space_int & 0x3F) {
-            diff = 0x40 - (working_space_int & 0x3F);
-        }
-
-        working_space_bytes += diff;
-
-        _working_space = reinterpret_cast<void *>(working_space_bytes);
-    }
-
-    ~GemmInterleaved2d() override { }
-};
-
-} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
index eff4877..bdccd05 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,6 +35,7 @@
 
 #include <algorithm>
 #include <cassert>
+#include <cmath>
 
 // Some macros used to decide how much working space to allocate.
 // Round allocations up to the next cache line.
@@ -62,9 +63,6 @@
     const unsigned int _nbatches;
     const unsigned int _nmulti;
 
-    const bool _trA;
-    const bool _trB;
-
     const Activation _act;
 
     const int _maxthreads;
@@ -173,16 +171,13 @@
 
     // Internal execute function.
     // This supports both the "pretransposed" and "standard" interfaces via the template parameter.
-    void execute_pretranspose(unsigned int m_start, unsigned int m_end, unsigned int n_start, unsigned int n_end, int threadid, int mthreadid, int nthreadid) {
+    void execute_pretranspose(unsigned int m_start, unsigned int m_end, unsigned int n_start, unsigned int n_end, int threadid, int, int) {
         /* Make sure we've been set up correctly. */
         assert(_B_transposed);
         assert(_working_space);
         assert(this->_Aptr);
         assert(this->_Cptr);
 
-        UNUSED(mthreadid);
-        UNUSED(nthreadid);
-
 #ifdef CYCLE_PROFILING
         profiler prof;
 #endif
@@ -255,8 +250,7 @@
                         first_m,
                         last_m,
                         current.k0(),
-                        current.kmax(),
-                        _trA);
+                        current.kmax());
                 }
             }
 
@@ -308,6 +302,36 @@
         }
     }
 
+    static unsigned int get_k_block_size(const GemmArgs &args) {
+        // Work out blocking parameters, or override from provided GemmConfig
+        if (args._cfg && args._cfg->inner_block_size) {
+            return args._cfg->inner_block_size;
+        }
+
+        const unsigned int L1_size = args._ci->get_L1_cache_size();
+        unsigned int k_block;
+
+        // k_block: Find out how much of the larger array can be loaded into half the cache.
+        // This should account for associative caches.
+        k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
+
+        // Needs to be (at least a single) multiple of the K unroll level.
+        k_block /= strategy::k_unroll();
+        k_block = std::max(k_block, 1U) * strategy::k_unroll();
+
+        // Now tune to presented problem size; this is how many blocks we need.
+        unsigned int numk_blocks = iceildiv(args._Ksize, k_block);
+
+        // So divide the space equally into that many blocks.
+        k_block = iceildiv(args._Ksize, numk_blocks);
+
+        // And round UP to the K unroll level required.
+        k_block = iceildiv(k_block, strategy::k_unroll());
+        k_block *= strategy::k_unroll();
+
+        return k_block;
+    }
+
 public:
     GemmInterleavedPretransposed2d(GemmInterleavedPretransposed2d &) = delete;
     GemmInterleavedPretransposed2d & operator= (GemmInterleavedPretransposed2d &) = delete;
@@ -320,12 +344,10 @@
     ,    _Ksize(args._Ksize)
     ,    _nbatches(args._nbatches)
     ,    _nmulti(args._nmulti)
-    ,    _trA(args._trA)
-    ,    _trB(args._trB)
     ,    _act(args._act)
     ,    _maxthreads(args._maxthreads)
-    ,    _nthreads(args._maxthreads) 
-
+    ,    _nthreads(args._maxthreads)
+    ,    _k_block(get_k_block_size(args))
     // Work out the rounded size of M - needed for some buffers.
     ,    _Mround_div ( iceildiv(_Msize, strategy::out_height()) )
     ,    _Mround     ( _Mround_div * strategy::out_height()     )
@@ -333,36 +355,10 @@
     ,    _Nround_div ( iceildiv(_Nsize, strategy::out_width()) )
     ,    _Nround     ( _Nround_div * strategy::out_width()     )
     {
-
-        assert(args._pretransposed_hint);
         assert(_maxthreads > 0);
 
-        const unsigned int L1_size = _ci->get_L1_cache_size();
         const unsigned int L2_size = _ci->get_L2_cache_size();
 
-        // Work out blocking parameters, or override from provided GemmConfig
-        if (args._cfg && args._cfg->inner_block_size) {
-            _k_block = args._cfg->inner_block_size;
-        } else {
-            // k_block: Find out how much of the larger array can be loaded into half the cache.
-            // This should account for associative caches.
-            _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
-
-            // Needs to be (at least a single) multiple of the K unroll level.
-            _k_block /= strategy::k_unroll();
-            _k_block = std::max(_k_block, 1U) * strategy::k_unroll();
-
-            // Now tune to presented problem size; this is how many blocks we need.
-            unsigned int num_k_blocks = iceildiv(_Ksize, _k_block);
-
-            // So divide the space equally into that many blocks.
-            _k_block = iceildiv(_Ksize, num_k_blocks);
-
-            // And round UP to the K unroll level required.
-            _k_block = iceildiv(_k_block, strategy::k_unroll());
-            _k_block *= strategy::k_unroll();
-        }
-
         if (args._cfg && args._cfg->outer_block_size) {
             _x_block = args._cfg->outer_block_size;
         } else {
@@ -389,7 +385,11 @@
         unsigned m = (_Mround / strategy::out_height()) * _nbatches;
         unsigned n = _Nround_div;
 
-        return { m, n, 1u, 1u, 1u, 1u };
+        return { m, n };
+    }
+
+    bool supports_dynamic_scheduling() const override {
+        return true;
     }
 
     // set_nthreads: pass on to buffer manager to avoid it waiting for non-existant threads.
@@ -401,8 +401,6 @@
         /* This particular GEMM implementation can only be broken up over the M & N
          * dimensions, we inform the frame work of this limitation via the get_window_size function
          */
-        assert(ndrange_popcount(work_range) <= 2);
-
         const auto m_start = work_range.get_position(0);
         const auto n_start = work_range.get_position(1);
         const auto m_size  = work_range.get_size(0);
@@ -416,7 +414,7 @@
         execute_pretranspose(m_start, m_end, n_start, n_end, threadid, m_threadid, n_threadid);
     }
 
-    std::size_t get_working_size()const override {
+    std::size_t get_working_size() const override {
         /* Because we do not know how schedular will break up
          * the task, we need to ensure that alloc enough
          * space to be able to handle the case where every thread
@@ -498,7 +496,7 @@
             k_size *= strategy::k_unroll();
 
             strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
-                                      current.x0(), current.xmax(), current.k0(), current.kmax(), _trB);
+                                      current.x0(), current.xmax(), current.k0(), current.kmax());
 
             buffer += (x_size * k_size);
         } while (current.advance());
@@ -508,7 +506,60 @@
         _B_transposed = reinterpret_cast<Toi *>(in_buffer);
     }
 
-    ~GemmInterleavedPretransposed2d() override { }
+    // Estimate cycles for given problem given provided parameters
+    static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params) {
+        unsigned int k_blocks = iceildiv(args._Ksize, get_k_block_size(args));
+        unsigned int m_blocks = iceildiv(args._Msize, strategy::out_height()) * args._nbatches;
+        unsigned int n_blocks = iceildiv(args._Nsize, strategy::out_width());
+
+        uint64_t total_macs    = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * roundup(args._Ksize, strategy::k_unroll());
+        uint64_t prepare_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Ksize, strategy::k_unroll()) * sizeof(Toi);
+        uint64_t merge_bytes   = static_cast<uint64_t>(args._nbatches) * args._nmulti * k_blocks * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * sizeof(Tr);
+
+        // Wide problems incur extra preparation cost, as it is done per thread.
+        // Duplicate the logic the scheduler will later use to figure out how much that will affect us
+        float ratio = m_blocks / static_cast<float>(n_blocks);
+
+        unsigned int ideal_height = static_cast<unsigned int>(std::sqrt(args._maxthreads * ratio) + 0.5);
+        unsigned int height = 1;
+
+        if (ideal_height == 0) {
+            height = 1;
+        } else {
+            for (unsigned int adj=0; adj<ideal_height; adj++) {
+                const unsigned int round_down = ideal_height - adj;
+                if (args._maxthreads % round_down == 0) {
+                    height = round_down;
+                    break;
+                }
+
+                const unsigned int round_up = ideal_height + adj;
+                if (args._maxthreads % round_up == 0) {
+                    height = round_up;
+                    break;
+                }
+            }
+        }
+
+        // We've computed the height here - we need to multiply the amount of preparation effort by the width (which is total threads / height)
+        prepare_bytes *= (args._maxthreads / height);
+
+        float mac_cycles     = static_cast<float>(total_macs) / params.kernel_macs_cycle;
+        float prepare_cycles = static_cast<float>(prepare_bytes) / params.prepare_bytes_cycle;
+        float merge_cycles   = static_cast<float>(merge_bytes) / params.merge_bytes_cycle;
+
+        float total_cycles = mac_cycles + prepare_cycles + merge_cycles;
+
+        // We can't thread over multis, which might be a problem in some
+        // threaded cases.  Penalize that here.
+        float parallelism_available = static_cast<float>(iceildiv(args._Msize, strategy::out_height()) * args._nbatches * iceildiv(args._Nsize, strategy::out_width())) * 0.9;
+
+        if (parallelism_available < args._maxthreads) {
+            total_cycles *= (static_cast<float>(args._maxthreads) / parallelism_available);
+        }
+
+        return static_cast<uint64_t>(total_cycles);
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
deleted file mode 100644
index fb01a73..0000000
--- a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include <stdio.h>
-
-#include "arm_gemm.hpp"
-
-#include "arm_compute/core/NEON/kernels/arm_gemm/ndrange.hpp"
-
-#ifdef CYCLE_PROFILING
-#include "profiler.hpp"
-#endif
-
-namespace arm_gemm {
-
-// Implementation of the GemmCommon abstract class.
-//
-// This is implementation is for native GEMM with no transposition.
-//
-// By default the source data is used in-place, but if type conversion is
-// needed we need to allocate working space (CURRENTLY NOT IMPLEMENTED).
-
-template<typename strategy, typename To, typename Tr>
-class GemmNative : public GemmCommon<To, Tr> {
-    typedef typename strategy::operand_type Toi;
-    typedef typename strategy::result_type Tri;
-
-    const unsigned int _Msize;
-    const unsigned int _Nsize;
-    const unsigned int _Ksize;
-
-    const unsigned int _nbatches;
-    const unsigned int _nmultis;
-
-    const Activation _act;
-
-    const CPUInfo * const _ci;
-
-    const unsigned int _k_block;
-    const unsigned int _n_block;
-
-    const NDRange<4> _window_range;
-
-    static unsigned int compute_k_block(const GemmArgs &args) {
-        return args._Ksize;
-    }
-
-    static unsigned int compute_n_block(const GemmArgs &args) {
-        if ((args._cfg != nullptr) && args._cfg->outer_block_size > 0) {
-            return args._cfg->outer_block_size;
-        } else {
-            return args._Nsize;
-        }
-    }
-
-public:
-    GemmNative(GemmNative &) = delete;
-    GemmNative & operator= (GemmNative &) = delete;
-
-    GemmNative(const GemmArgs &args)
-               : _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
-                 _nbatches(args._nbatches), _nmultis(args._nmulti),
-                 _act(args._act), _ci(args._ci),
-                 _k_block(compute_k_block(args)), _n_block(compute_n_block(args)),
-                 _window_range(iceildiv(_Msize, strategy::out_height()), _nbatches, iceildiv(_Nsize, _n_block), _nmultis) { }
-
-    // Window is amount per multi multiplied by total number of multis.
-    ndrange_t get_window_size() const override {
-        return { _window_range.total_size(), 1u, 1u, 1u, 1u, 1u };
-    }
-
-    // Native GEMMs can always be dynamically scheduled (whether requested or not)
-    bool supports_dynamic_scheduling() const override {
-        return true;
-    }
-
-    // Actually execute the GEMM.
-    void execute_1d(unsigned int start, unsigned int end, int) {
-#ifdef CYCLE_PROFILING
-        profiler prof;
-#endif
-        strategy strat(_ci);
-
-        static_assert(std::is_same<To, Toi>::value, "gemm_native: Operand types must be the same.");
-        static_assert(std::is_same<Tr, Tri>::value, "gemm_native: Result types must be the same.");
-
-        auto p = _window_range.iterator(start, end);
-
-        if (p.done()) {
-            return;
-        }
-
-        do {
-            unsigned int y0    = p.dim(0) * strategy::out_height();
-            unsigned int ymax  = std::min(p.dim0_max() * strategy::out_height(), _Msize);
-            unsigned int batch = p.dim(1);
-            unsigned int n0    = p.dim(2) * _n_block;
-            unsigned int nmax  = std::min(n0 + _n_block, _Nsize);
-            unsigned int multi = p.dim(3);
-
-#ifdef CYCLE_PROFILING
-            auto p = prof.ScopedProfiler(PROFILE_KERNEL, (ymax-y0) * (nmax - n0) * _Ksize);
-#endif
-
-            strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (y0 * this->_lda), this->_lda,
-                         this->_Bptr + (multi * this->_B_multi_stride) + n0, this->_ldb,
-                         this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (y0 * this->_ldc) + n0, this->_ldc,
-                         (ymax-y0), (nmax-n0), _Ksize,
-                         (strategy::supports_bias() && this->_bias) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
-                         _act, false);
-
-            // Add bias externally if needed
-            if (!strategy::supports_bias() && this->_bias) {
-                bias_adder(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (y0 * this->_ldc) + n0, this->_ldc,
-                           this->_bias + (multi * this->_bias_multi_stride) + n0,
-                           (ymax - y0), (nmax - n0));
-            }
-        } while (p.next_dim1());
-    }
-
-    //Execute
-    void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
-        UNUSED(thread_locator);
-
-        const auto start = work_range.get_position(0);
-        const auto stop  = work_range.get_position_end(0);
-
-        execute_1d(start, stop, threadid);
-    }
-};
-
-} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp
index 73d0c27..04cac60 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,14 +42,14 @@
 {
     GemmMethod::GEMM_HYBRID_QUANTIZED,
     "smallK_hybrid_s8s32_dot_1VLx8",
-    [](const GemmArgs &args, const Requantize32 &) { return args._Ksize<=64 && !args._trA && args._pretransposed_hint; },
+    [](const GemmArgs &args, const Requantize32 &) { return args._Ksize<=64; },
     nullptr,
     [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<smallK_hybrid_s8s32_dot_1VLx8, int8_t, int8_t>(args, qp); }
 },
 {
     GemmMethod::GEMM_HYBRID_QUANTIZED,
     "hybrid_s8s32_dot_4VLx4",
-    [](const GemmArgs &args, const Requantize32 &) { return args._Ksize>=16 && !args._trA && !args._trB && args._pretransposed_hint; },
+    [](const GemmArgs &args, const Requantize32 &) { return args._Ksize>=16; },
     [](const GemmArgs &args, const Requantize32 &) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
     [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<hybrid_s8s32_dot_4VLx4, int8_t, int8_t>(args, qp); }
 },
@@ -57,24 +57,32 @@
 {
     GemmMethod::GEMM_HYBRID_QUANTIZED,
     "smallK_hybrid_s8s32_dot_4x8",
-    [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32) && !args._trA && args._pretransposed_hint; },
+    [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32); },
     nullptr,
     [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<smallK_hybrid_s8s32_dot_4x8, int8_t, int8_t>(args, qp); }
 },
 {
     GemmMethod::GEMM_HYBRID_QUANTIZED,
     "smallK_hybrid_s8s32_dot_4x6",
-    [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64) && !args._trA && args._pretransposed_hint; },
+    [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64); },
     nullptr,
     [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<smallK_hybrid_s8s32_dot_4x6, int8_t, int8_t>(args, qp); }
 },
 {
     GemmMethod::GEMM_HYBRID_QUANTIZED,
     "hybrid_s8s32_dot_16x4",
-    [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && args._Ksize>=16 && !args._trA && !args._trB && args._pretransposed_hint; },
+    [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && args._Ksize>=16; },
     [](const GemmArgs &args, const Requantize32 &) { return args._Nsize<=256 && args._Ksize>128; },
     [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<hybrid_s8s32_dot_16x4, int8_t, int8_t>(args, qp); }
 },
+/** QUANTIZE_WRAPPER_2D enables 2D parallelisation hint for IScheduler in NEGEMMAssemblyDispatch */
+{
+    GemmMethod::QUANTIZE_WRAPPER_2D,
+    "quantized_wrapper_2d",
+    nullptr,
+    [](const GemmArgs &args, const Requantize32 &) { return (args._maxthreads >= 8) && (args._Msize >= 8) && (args._Nsize >= 8);},
+    [](const GemmArgs &args, const Requantize32 &qp) { return new QuantizeWrapper<int8_t, int8_t, int32_t>(args, qp); }
+},
 {
     GemmMethod::QUANTIZE_WRAPPER,
     "quantized_wrapper",
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp
index 59cd170..0125f9c 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,14 +42,14 @@
 {
     GemmMethod::GEMM_HYBRID_QUANTIZED,
     "smallK_hybrid_u8u32_dot_1VLx8",
-    [](const GemmArgs &args, const Requantize32 &) { return args._Ksize<=64 && !args._trA && args._pretransposed_hint; },
+    [](const GemmArgs &args, const Requantize32 &) { return args._Ksize<=64; },
     nullptr,
     [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<smallK_hybrid_u8u32_dot_1VLx8, uint8_t, uint8_t>(args, qp); }
 },
 {
     GemmMethod::GEMM_HYBRID_QUANTIZED,
     "hybrid_u8u32_dot_4VLx4",
-    [](const GemmArgs &args, const Requantize32 &) { return args._Ksize>=16 && !args._trA && !args._trB && args._pretransposed_hint; },
+    [](const GemmArgs &args, const Requantize32 &) { return args._Ksize>=16; },
     [](const GemmArgs &args, const Requantize32 &) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
     [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<hybrid_u8u32_dot_4VLx4, uint8_t, uint8_t>(args, qp); }
 },
@@ -57,24 +57,32 @@
 {
     GemmMethod::GEMM_HYBRID_QUANTIZED,
     "smallK_hybrid_u8u32_dot_4x8",
-    [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32) && !args._trA && args._pretransposed_hint; },
+    [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32); },
     nullptr,
     [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<smallK_hybrid_u8u32_dot_4x8, uint8_t, uint8_t>(args, qp); }
 },
 {
     GemmMethod::GEMM_HYBRID_QUANTIZED,
     "smallK_hybrid_u8u32_dot_4x6",
-    [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64) && !args._trA && args._pretransposed_hint; },
+    [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64); },
     nullptr,
     [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<smallK_hybrid_u8u32_dot_4x6, uint8_t, uint8_t>(args, qp); }
 },
 {
     GemmMethod::GEMM_HYBRID_QUANTIZED,
     "hybrid_u8u32_dot_16x4",
-    [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && args._Ksize>=16 && !args._trA && !args._trB && args._pretransposed_hint; },
-    [](const GemmArgs &args, const Requantize32 &) { return args._Nsize<=256 && args._Ksize>128; },
+    [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && args._Ksize>=16; },
+    [](const GemmArgs &args, const Requantize32 &) { return ((args._Nsize<=256) && (args._Ksize>128)) || (args._maxthreads >= 8); },
     [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<hybrid_u8u32_dot_16x4, uint8_t, uint8_t>(args, qp); }
 },
+/** QUANTIZE_WRAPPER_2D enables 2D parallelisation hint for IScheduler in NEGEMMAssemblyDispatch */
+{
+    GemmMethod::QUANTIZE_WRAPPER_2D,
+    "quantized_wrapper_2d",
+    nullptr,
+    [](const GemmArgs &args, const Requantize32 &) { return (args._maxthreads >= 8) && (args._Msize >= 8) && (args._Nsize >= 8);},
+    [](const GemmArgs &args, const Requantize32 &qp) { return new QuantizeWrapper<uint8_t, uint8_t, uint32_t>(args, qp); }
+},
 {
     GemmMethod::QUANTIZE_WRAPPER,
     "quantized_wrapper",
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
index 85a8a67..5e06443 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
index 430d35e..88726b1 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,8 +27,8 @@
 #include "gemm_common.hpp"
 #include "gemm_implementation.hpp"
 #include "gemm_interleaved.hpp"
+#include "gemm_interleaved_pretransposed_2d.hpp"
 #include "gemm_hybrid.hpp"
-#include "gemm_native.hpp"
 
 #include "kernels/a64_gemm_u16_12x8.hpp"
 #include "kernels/a64_gemm_u8_12x8.hpp"
@@ -40,14 +40,13 @@
 #include "kernels/sve_hybrid_u8u32_dot_4VLx4.hpp"
 #include "kernels/sve_interleaved_u8u32_dot_3VLx8.hpp"
 #include "kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp"
-#include "kernels/sve_native_u8u32_dot_4VLx4.hpp"
 #include "kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp"
 
 namespace arm_gemm {
 
 static const GemmImplementation<uint8_t, uint32_t> gemm_u8_methods[] = {
 #ifdef __ARM_FEATURE_SVE
-#ifdef V8P6
+#ifdef MMLA_INT8
 {
     GemmMethod::GEMM_INTERLEAVED,
     "interleaved_u8u32_mmla_3VLx8",
@@ -59,25 +58,18 @@
 {
     GemmMethod::GEMM_HYBRID,
     "smallK_hybrid_u8u32_dot_1VLx8",
-    [](const GemmArgs &args) { return args._Ksize<=64 && !args._trA && args._pretransposed_hint; },
+    [](const GemmArgs &args) { return args._Ksize<=64; },
     nullptr,
     [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_u8u32_dot_1VLx8, uint8_t, uint32_t>(args); }
 },
 {
     GemmMethod::GEMM_HYBRID,
     "hybrid_u8u32_dot_4VLx4",
-    [](const GemmArgs &args) { return args._Ksize>=16 && !args._trA && args._pretransposed_hint; },
+    [](const GemmArgs &args) { return args._Ksize>=16; },
     [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
     [](const GemmArgs &args) { return new GemmHybrid<hybrid_u8u32_dot_4VLx4, uint8_t, uint32_t>(args); }
 },
 {
-    GemmMethod::GEMM_NATIVE,
-    "native_u8u32_dot_4VLx4",
-    [](const GemmArgs &args) { return (args._Ksize>=16 && !args._trA && !args._trB); },
-    [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)); },
-    [](const GemmArgs &args) { return new GemmNative<native_u8u32_dot_4VLx4, uint8_t, uint32_t>(args); }
-},
-{
     GemmMethod::GEMM_INTERLEAVED,
     "interleaved_u8u32_dot_3VLx8",
     [](const GemmArgs &args) { return (args._Ksize>4); },
@@ -85,7 +77,7 @@
     [](const GemmArgs &args) { return new GemmInterleaved<interleaved_u8u32_dot_3VLx8, uint8_t, uint32_t>(args); }
 },
 #endif
-#ifdef V8P6
+#ifdef MMLA_INT8
 {
     GemmMethod::GEMM_INTERLEAVED,
     "interleaved_u8u32_mmla_12x8",
@@ -97,34 +89,55 @@
 {
     GemmMethod::GEMM_HYBRID,
     "smallK_hybrid_u8u32_dot_4x8",
-    [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32) && !args._trA && args._pretransposed_hint; },
+    [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32); },
     nullptr,
     [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_u8u32_dot_4x8, uint8_t, uint32_t>(args); }
 },
 {
     GemmMethod::GEMM_HYBRID,
     "smallK_hybrid_u8u32_dot_4x6",
-    [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64) && !args._trA && args._pretransposed_hint; },
+    [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64); },
     nullptr,
     [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_u8u32_dot_4x6, uint8_t, uint32_t>(args); }
 },
 {
+    GemmMethod::GEMM_INTERLEAVED,
+    "gemm_u16_12x8",
+    nullptr,
+    [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53; },
+    [](const GemmArgs &args) { return new GemmInterleaved<gemm_u16_12x8, uint8_t, uint32_t>(args); },
+},
+{
     GemmMethod::GEMM_HYBRID,
     "hybrid_u8u32_dot_16x4",
-    [](const GemmArgs &args) { return args._ci->has_dotprod() && args._Ksize>=16 && !args._trA && !args._trB && args._pretransposed_hint; },
+    [](const GemmArgs &args) { return args._ci->has_dotprod() && args._Ksize>=16; },
     [](const GemmArgs &args) { return args._Nsize<=256 && args._Ksize>128; },
     [](const GemmArgs &args) { return new GemmHybrid<hybrid_u8u32_dot_16x4, uint8_t, uint32_t>(args); }
 },
 {
+    GemmMethod::GEMM_INTERLEAVED_2D,
+    "gemm_u8_12x8_2d",
+    [](const GemmArgs &args) { return args._ci->has_dotprod(); },
+    [](const GemmArgs &args) { return (args._maxthreads >= 8) && (args._Msize >= 8) && (args._Nsize >= 8) ; },
+    [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<gemm_u8_12x8, uint8_t, uint32_t>(args); }
+},
+{
     GemmMethod::GEMM_INTERLEAVED,
-    "gemm_u8_12x8",
+    "gemm_u8_12x8_1d",
     [](const GemmArgs &args) { return args._ci->has_dotprod(); },
     nullptr,
     [](const GemmArgs &args) { return new GemmInterleaved<gemm_u8_12x8, uint8_t, uint32_t>(args); }
 },
 {
+    GemmMethod::GEMM_INTERLEAVED_2D,
+    "gemm_u8_4x4_2d",
+    nullptr,
+    [](const GemmArgs &args) { return (args._maxthreads >= 8) && (args._Msize >= 8) && (args._Nsize >= 8); },
+    [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<gemm_u8_4x4, uint8_t, uint32_t>(args); }
+},
+{
     GemmMethod::GEMM_INTERLEAVED,
-    "gemm_u8_4x4",
+    "gemm_u8_4x4_1d",
     nullptr,
     nullptr,
     [](const GemmArgs &args) { return new GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t>(args); }
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
index 939788e..1221600 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,17 +45,15 @@
         _subgemm = gemm<To,Tr>(newargs);
     }
 
-    void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride,
+    void set_arrays(const To *A, const int, const int A_batch_stride, const int A_multi_stride,
                     const To *B, const int ldb, const int B_multi_stride,
-                          Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
+                          Tr *C, const int, const int C_batch_stride, const int C_multi_stride,
                     const Tr *bias, const int bias_multi_stride) override {
         /* A and C's batch stride becomes their new row stride.  New batch stride is 0 as nbatches for subgemm is always 1. */
         _subgemm->set_arrays(A, A_batch_stride, 0, A_multi_stride,
                              B, ldb, B_multi_stride,
                              C, C_batch_stride, 0, C_multi_stride,
                              bias, bias_multi_stride);
-        UNUSED(lda);
-        UNUSED(ldc);
     }
 
     ndrange_t get_window_size() const override {
@@ -66,7 +64,7 @@
         _subgemm->set_nthreads(nthreads);
     }
 
-    void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
+    void execute(const ndcoord_t &work_range, const ndcoord_t &thread_locator, int threadid) override {
         _subgemm->execute(work_range, thread_locator, threadid);
     }
 
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
deleted file mode 100644
index 190f4aa..0000000
--- a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include <stdio.h>
-
-#include "arm_gemm.hpp"
-
-#include "mergeresults.hpp"
-#include "transform.hpp"
-
-#ifdef CYCLE_PROFILING
-#include "profiler.hpp"
-#endif
-
-namespace arm_gemm {
-
-// Implementation of the GemmCommon abstract class.
-//
-// This is implementation is for a "native" (no-transform) GEMV with a
-// transposed matrix.
-//
-// As a native operation the source data is used in-place, so the internal
-// and external operand/result types must match.
-template<typename strategy, typename To, typename Tr>
-class GemvNativeTransposed : public GemmCommon<To, Tr> {
-    typedef typename strategy::operand_type Toi;
-    typedef typename strategy::result_type Tri;
-
-    const unsigned int _Nsize;
-    const unsigned int _Ksize;
-
-    const unsigned int _nmultis;
-
-    const Activation _act;
-
-    const CPUInfo * const _ci;
-
-    unsigned int m_block=0;
-    unsigned int n_block=0;
-
-public:
-    GemvNativeTransposed(GemvNativeTransposed &) = delete;
-    GemvNativeTransposed & operator= (GemvNativeTransposed &) = delete;
-
-    GemvNativeTransposed(const GemmArgs &args)
-                         : _Nsize(args._Nsize), _Ksize(args._Ksize), _nmultis(args._nmulti), _act(args._act), _ci(args._ci) {
-        /* For now don't do any blocking. TODO: figure out if we should. */
-        m_block = _Ksize;
-        n_block = _Nsize;
-    }
-
-    // Window is number of out_width blocks times number of multis.
-    ndrange_t get_window_size() const override {
-        return { iceildiv(_Nsize, strategy::out_width()) * _nmultis, 1u, 1u, 1u, 1u, 1u };
-    }
-
-    // Actually execute the GEMV.
-    void execute_1d(unsigned int start, unsigned int end, int) {
-#ifdef CYCLE_PROFILING
-        profiler prof;
-#endif
-        strategy strat(_ci);
-
-        const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width());
-        const unsigned int multi_0   = start / window_per_multi;
-        const unsigned int multi_end = end   / window_per_multi;
-
-        const unsigned int n_0   = (start - (multi_0 * window_per_multi)) * strategy::out_width();
-        const unsigned int n_max = (end - (multi_end * window_per_multi)) * strategy::out_width();
-
-        static_assert(std::is_same<To, Toi>::value, "gemv_transposed: Operand types must be the same.");
-        static_assert(std::is_same<Tr, Tri>::value, "gemv_transposed: Result types must be the same.");
-
-        for (unsigned int multi=multi_0; multi<=multi_end; multi++) {
-            const unsigned int n_start = (multi==multi_0) ? n_0 : 0;
-            const unsigned int n_end = (multi==multi_end) ? n_max : _Nsize;
-
-            if (n_end <= n_start)
-                continue;
-
-            for (unsigned int m0=0; m0<_Ksize; m0+=m_block) {
-                unsigned int mmax = std::min(m0 + m_block, _Ksize);
-
-                for (unsigned int n0=n_start; n0<n_end; n0+=n_block) {
-                    unsigned int nmax = std::min(n0 + n_block, n_end);
-#ifdef CYCLE_PROFILING
-                    auto p = prof.ScopedProfiler(PROFILE_KERNEL, (mmax-m0) * (nmax-n0));
-#endif
-                    strat.kernel(this->_Bptr + (multi * this->_B_multi_stride) + (m0 * this->_ldb) + n0,
-                                 this->_Aptr + (multi * this->_A_multi_stride) + m0,
-                                 this->_Cptr + (multi * this->_C_multi_stride) + n0,
-                                 static_cast<Tr>(0), this->_ldb, (mmax-m0), (nmax-n0));
-
-                    // Handle activation separately for now
-                    if (this->_bias) {
-                        activator<true>(this->_Cptr + (multi * this->_C_multi_stride) + n0, 0,
-                                        this->_bias + (multi * this->_bias_multi_stride) + n0,
-                                        _act, 1, (nmax-n0));
-                    } else {
-                        activator<false>(this->_Cptr + (multi * this->_C_multi_stride) + n0, 0,
-                                         static_cast<const Tr *>(nullptr),
-                                         _act, 1, (nmax-n0));
-                    }
-                }
-            }
-        }
-    }
-
-    // Execute
-    void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
-        UNUSED(thread_locator);
-
-        const auto start = work_range.get_position(0);
-        const auto size  = work_range.get_size(0);
-        const auto stop  = start + size;
-
-        execute_1d(start, stop, threadid);
-    }
-};
-
-} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
index 7f52ac5..47909cd 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -51,8 +51,6 @@
 
     const unsigned int _nmultis;
 
-    const bool _trB;
-
     const Activation _act;
 
     const CPUInfo * const _ci;
@@ -69,7 +67,7 @@
     GemvPretransposed & operator= (GemvPretransposed &) = delete;
 
     GemvPretransposed(const GemmArgs &args)
-                      : _Nsize(args._Nsize), _Ksize(args._Ksize), _nmultis(args._nmulti), _trB(args._trB), _act(args._act), _ci(args._ci),
+                      : _Nsize(args._Nsize), _Ksize(args._Ksize), _nmultis(args._nmulti), _act(args._act), _ci(args._ci),
                         _buffer_per_multi(_Ksize * iceildiv(_Nsize, strategy::A_interleave()) * strategy::A_interleave()) {
         /* For now don't do any blocking. TODO: figure out if we should. */
         if (args._cfg && args._cfg->inner_block_size) {
@@ -87,16 +85,19 @@
 
     // Window is number of out_width blocks, times number of multis.
     ndrange_t get_window_size() const override {
-        return { iceildiv(_Nsize, strategy::out_width()) * _nmultis, 1u, 1u, 1u, 1u, 1u };
+        return { iceildiv(_Nsize, strategy::out_width()) * _nmultis };
     }
 
     // Actually execute the GEMV.
-    void execute_1d(unsigned int start, unsigned int end, int) {
+    void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override {
 #ifdef CYCLE_PROFILING
         profiler prof;
 #endif
         strategy strat(_ci);
 
+        const auto start = work_range.get_position(0);
+        const auto end   = work_range.get_position_end(0);
+
         /* Break the window values down into multis of interest... */
         const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width());
         const unsigned int multi_0    = start / window_per_multi;
@@ -145,17 +146,6 @@
         }
     }
 
-    // Execute
-    void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
-        UNUSED(thread_locator);
-
-        const auto start = work_range.get_position(0);
-        const auto size  = work_range.get_size(0);
-        const auto stop  = start + size;
-
-        execute_1d(start, stop, threadid);
-    }
-
     /* Pretransposed interface implementation */
     bool B_is_pretransposed() const override {
         return true;
@@ -177,7 +167,6 @@
-            /* Reverse sense here as we are dealing with B rather than A.  So if
-             * strategy::A_transpose is false and _trB is false, we still
-             * transpose.  */
-            if (_trB ^ strategy::A_transpose()) {
+            /* Reverse sense here as we are dealing with B rather than A.  So
+             * if strategy::A_transpose is false, we still transpose.  */
+            if (strategy::A_transpose()) {
                 Transform<strategy::A_interleave(), strategy::A_block(), false>(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize);
             } else {
                 Transform<strategy::A_interleave(), strategy::A_block(), true>(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize);
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp
index 554c9c7..ef175be 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp
index faabf66..8a98f66 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a53.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp
index 03880d8..8126826 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/a55r1.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp
index 3c840af..a7494d5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a32_sgemm_8x6/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
index 8700c42..0f0e5a7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -65,7 +65,7 @@
 
     kern_type kernel = a64_gemm_s16_asimd_12x8;
 
-    gemm_s16_12x8(const CPUInfo *ci) { UNUSED(ci); }
+    gemm_s16_12x8(const CPUInfo *) { }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp
index 823079a..7052f83 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
index cc6c583..0e294bf 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,6 +34,7 @@
 // Load the actual kernel
 void a64_gemm_s8_12x8(const int8_t *, const int8_t *, int32_t *, int, int, int);
 void a64_gemm_s8_12x8_a55r1(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void a64_gemm_s8_12x8_x1(const int8_t *, const int8_t *, int32_t *, int, int, int);
 
 class gemm_s8_12x8 {
 public:
@@ -65,6 +66,8 @@
 
         if (mod == CPUModel::A55r1) {
             kernel = a64_gemm_s8_12x8_a55r1;
+        } else if (mod == CPUModel::X1) {
+            kernel = a64_gemm_s8_12x8_x1;
         }
     }
 };
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp
index a2e0cbd..ddd8124 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp
index c27c94b..a7abaed 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp
new file mode 100644
index 0000000..446fcf8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm {
+
+void a64_gemm_s8_12x8_x1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
+    const int8_t *a_ptr = Apanel;
+    int32_t *c_ptr = Cpanel;
+    // We divide K by 4 because the sdot instruction processes 4 elements at a time.
+    const int W = K/4;
+    // Fix up for odd lengths - set a flag if K is odd, but make
+    // sure we round up the iteration count.
+    const int oddk = (W & 1);
+    const int init_value_k = ((W+1)/2) - 1;
+    for (int yb=0; yb<ablocks; yb++) {
+        const int8_t *a_ptr0 = a_ptr;
+        const int8_t *b_ptr = Bpanel;
+        for (int xb=0; xb<bblocks; xb++) {
+            a_ptr = a_ptr0;
+            int k = init_value_k;
+            register uint8x16_t a0  asm("v0");
+            register uint8x16_t a1  asm("v1");
+            register uint8x16_t b0  asm("v2");
+            register uint8x16_t b1  asm("v3");
+            register uint8x16_t b2  asm("v4");
+
+            __asm __volatile (
+                // Initialize result registers, load initial operands, prime prefetches.
+                "movi	v8.4s, #0x0\n"
+                "ldr	%q[a0], [%[a_ptr]]\n"
+                "movi	v9.4s, #0x0\n"
+                "ldr	%q[b0], [%[b_ptr]]\n"
+                "movi	v10.4s, #0x0\n"
+                "ldr	%q[a1], [%[a_ptr], #16]\n"
+                "movi	v11.4s, #0x0\n"
+                "ldr	%q[b1], [%[b_ptr], #16]\n"
+                "movi	v12.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi	v13.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi	v14.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi	v15.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #128]")
+                "movi	v16.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi	v17.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi	v18.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi	v19.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi	v20.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #256]")
+                "movi	v21.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #384]")
+                "movi	v22.4s, #0x0\n"
+                "movi	v23.4s, #0x0\n"
+                "movi	v24.4s, #0x0\n"
+                "movi	v25.4s, #0x0\n"
+                "movi	v26.4s, #0x0\n"
+                "movi	v27.4s, #0x0\n"
+                "movi	v28.4s, #0x0\n"
+                "movi	v29.4s, #0x0\n"
+                "movi	v30.4s, #0x0\n"
+                "movi	v31.4s, #0x0\n"
+
+                // Skip loop if we are doing zero iterations of it.
+                "cbz	%w[k], 4f\n"
+
+                // Loop proper
+                "1:\n"
+                ".word 0x4f80e048 // sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                ".word 0x4fa0e049 // sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "ldr	%q[b2], [%[b_ptr], #32]\n"
+                ".word 0x4f80e84a // sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                ".word 0x4fa0e84b // sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                ".word 0x4f81e04c // sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                ".word 0x4fa1e04d // sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                ".word 0x4f81e84e // sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                ".word 0x4fa1e84f // sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr	%q[b0], [%[b_ptr], #48]\n"
+
+                ".word 0x4f80e070 // sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                ".word 0x4fa0e071 // sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                ASM_PREFETCH("[%[a_ptr], #320]")
+                ".word 0x4f80e872 // sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                ".word 0x4fa0e873 // sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                ".word 0x4f81e074 // sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                ".word 0x4fa1e075 // sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                ".word 0x4f81e876 // sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                ".word 0x4fa1e877 // sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "ldr	%q[b1], [%[b_ptr], #64]\n"
+
+                ".word 0x4f80e098 // sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                ".word 0x4fa0e099 // sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                ASM_PREFETCH("[%[b_ptr], #448]")
+                ".word 0x4f80e89a // sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                ".word 0x4fa0e89b // sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "ldr	%q[a0], [%[a_ptr], #32]\n"
+                ".word 0x4f81e09c // sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                ".word 0x4fa1e09d // sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                ".word 0x4f81e89e // sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                ".word 0x4fa1e89f // sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "ldr	%q[a1], [%[a_ptr], #48]\n"
+
+                ".word 0x4f80e048 // sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                ".word 0x4fa0e049 // sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "ldr	%q[b2], [%[b_ptr], #80]\n"
+                ".word 0x4f80e84a // sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                ".word 0x4fa0e84b // sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                ".word 0x4f81e04c // sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                ".word 0x4fa1e04d // sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                ".word 0x4f81e84e // sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                ".word 0x4fa1e84f // sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr	%q[b0], [%[b_ptr], #96]\n"
+
+                ".word 0x4f80e070 // sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                ".word 0x4fa0e071 // sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                ASM_PREFETCH("[%[b_ptr], #512]")
+                ".word 0x4f80e872 // sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                ".word 0x4fa0e873 // sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "subs	%w[k], %w[k], #1\n"
+                ".word 0x4f81e074 // sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                ".word 0x4fa1e075 // sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                ".word 0x4f81e876 // sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                ".word 0x4fa1e877 // sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "ldr	%q[b1], [%[b_ptr], #112]\n"
+
+                ".word 0x4f80e098 // sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                ".word 0x4fa0e099 // sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                "add	%[a_ptr], %[a_ptr], #64\n"
+                ".word 0x4f80e89a // sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                ".word 0x4fa0e89b // sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "ldr	%q[a0], [%[a_ptr]]\n"
+                ".word 0x4f81e09c // sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
+                ".word 0x4fa1e09d // sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                ".word 0x4f81e89e // sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                ".word 0x4fa1e89f // sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "ldr	%q[a1], [%[a_ptr], #16]\n"
+                "bne	1b\n"
+
+                // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
+                "4:\n"
+
+                // Branch to alternative tail for odd K
+                "cbnz	%w[oddk], 2f\n"
+
+                // Detached final iteration (even K)
+                ".word 0x4f80e048 // sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                ".word 0x4fa0e049 // sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "ldr	%q[b2], [%[b_ptr], #32]\n"
+                ".word 0x4f80e84a // sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                ".word 0x4fa0e84b // sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                ".word 0x4f81e04c // sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                ".word 0x4fa1e04d // sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                ".word 0x4f81e84e // sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                ".word 0x4fa1e84f // sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr	%q[b0], [%[b_ptr], #48]\n"
+
+                ".word 0x4f80e070 // sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                ".word 0x4fa0e071 // sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                ".word 0x4f80e872 // sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                ".word 0x4fa0e873 // sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                ".word 0x4f81e074 // sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                ".word 0x4fa1e075 // sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                ".word 0x4f81e876 // sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                ".word 0x4fa1e877 // sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "ldr	%q[b1], [%[b_ptr], #64]\n"
+
+                ".word 0x4f80e098 // sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                ".word 0x4fa0e099 // sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                "add	%[a_ptr], %[a_ptr], #64\n"
+                ".word 0x4f80e89a // sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                ".word 0x4fa0e89b // sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "ldr	%q[a0], [%[a_ptr], #-32]\n"
+                ".word 0x4f81e09c // sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                ".word 0x4fa1e09d // sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                ".word 0x4f81e89e // sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                ".word 0x4fa1e89f // sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "ldr	%q[a1], [%[a_ptr], #-16]\n"
+
+                ".word 0x4f80e048 // sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "ldr	%q[b2], [%[b_ptr], #80]\n"
+
+                ".word 0x4f80e070 // sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
+                ".word 0x4fa0e049 // sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "str	q8, [%[c_ptr], #0]\n"
+                ".word 0x4fa0e071 // sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                "str	q16, [%[c_ptr], #16]\n"
+                ".word 0x4f80e098 // sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "str	q24, [%[c_ptr], #32]\n"
+
+                ".word 0x4fa0e099 // sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                "str	q9, [%[c_ptr], #48]\n"
+                ".word 0x4f80e84a // sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "str	q17, [%[c_ptr], #64]\n"
+                ".word 0x4f80e872 // sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "str	q25, [%[c_ptr], #80]\n"
+                ".word 0x4f80e89a // sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "str	q10, [%[c_ptr], #96]\n"
+
+                ".word 0x4fa0e84b // sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "str	q18, [%[c_ptr], #112]\n"
+                ".word 0x4fa0e873 // sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "str	q26, [%[c_ptr], #128]\n"
+                ".word 0x4fa0e89b // sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "str	q11, [%[c_ptr], #144]\n"
+
+                ".word 0x4f81e04c // sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "str	q19, [%[c_ptr], #160]\n"
+                ".word 0x4f81e074 // sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "str	q27, [%[c_ptr], #176]\n"
+                ".word 0x4f81e09c // sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "str	q12, [%[c_ptr], #192]\n"
+
+                ".word 0x4fa1e04d // sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                "str	q20, [%[c_ptr], #208]\n"
+                ".word 0x4fa1e075 // sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "str	q28, [%[c_ptr], #224]\n"
+                ".word 0x4fa1e09d // sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                "str	q13, [%[c_ptr], #240]\n"
+
+                ".word 0x4f81e84e // sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "str	q21, [%[c_ptr], #256]\n"
+                ".word 0x4f81e876 // sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "str	q29, [%[c_ptr], #272]\n"
+                ".word 0x4f81e89e // sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                "str	q14, [%[c_ptr], #288]\n"
+
+                ".word 0x4fa1e84f // sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "str	q22, [%[c_ptr], #304]\n"
+                ".word 0x4fa1e877 // sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "str	q30, [%[c_ptr], #320]\n"
+                ".word 0x4fa1e89f // sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "str	q15, [%[c_ptr], #336]\n"
+
+                "b	3f\n"
+
+                // Detached final iteration (odd K)
+                "2:\n"
+                ".word 0x4f80e048 // sdot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "ldr	%q[b2], [%[b_ptr], #32]\n"
+                ".word 0x4f80e070 // sdot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                ".word 0x4fa0e049 // sdot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "str	q8, [%[c_ptr], #0]\n"
+                ".word 0x4fa0e071 // sdot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                "str	q16, [%[c_ptr], #16]\n"
+                ".word 0x4f80e098 // sdot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "add	%[b_ptr], %[b_ptr], #48\n"
+                "add	%[a_ptr], %[a_ptr], #32\n"
+                "str	q24, [%[c_ptr], #32]\n"
+                ".word 0x4fa0e099 // sdot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                "str	q9, [%[c_ptr], #48]\n"
+
+                ".word 0x4f80e84a // sdot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "str	q17, [%[c_ptr], #64]\n"
+                ".word 0x4f80e872 // sdot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "str	q25, [%[c_ptr], #80]\n"
+                ".word 0x4f80e89a // sdot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "str	q10, [%[c_ptr], #96]\n"
+
+                ".word 0x4fa0e84b // sdot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "str	q18, [%[c_ptr], #112]\n"
+                ".word 0x4fa0e873 // sdot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "str	q26, [%[c_ptr], #128]\n"
+                ".word 0x4fa0e89b // sdot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "str	q11, [%[c_ptr], #144]\n"
+
+                ".word 0x4f81e04c // sdot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "str	q19, [%[c_ptr], #160]\n"
+                ".word 0x4f81e074 // sdot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "str	q27, [%[c_ptr], #176]\n"
+                ".word 0x4f81e09c // sdot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "str	q12, [%[c_ptr], #192]\n"
+
+                ".word 0x4fa1e04d // sdot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                "str	q20, [%[c_ptr], #208]\n"
+                ".word 0x4fa1e075 // sdot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "str	q28, [%[c_ptr], #224]\n"
+                ".word 0x4fa1e09d // sdot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                "str	q13, [%[c_ptr], #240]\n"
+
+                ".word 0x4f81e84e // sdot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "str	q21, [%[c_ptr], #256]\n"
+                ".word 0x4f81e876 // sdot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "str	q29, [%[c_ptr], #272]\n"
+                ".word 0x4f81e89e // sdot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                "str	q14, [%[c_ptr], #288]\n"
+
+                ".word 0x4fa1e84f // sdot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "str	q22, [%[c_ptr], #304]\n"
+                ".word 0x4fa1e877 // sdot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "str	q30, [%[c_ptr], #320]\n"
+                ".word 0x4fa1e89f // sdot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "str	q15, [%[c_ptr], #336]\n"
+
+
+                // Common tail
+                "3:\n"
+                "str	q23, [%[c_ptr], #352]\n"
+                "str	q31, [%[c_ptr], #368]\n"
+                "add	%[c_ptr], %[c_ptr], #384\n"
+            :
+              [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+              [a0] "+w" (a0), [a1] "+w" (a1),
+              [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+            : [oddk] "r" (oddk)
+            : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+              "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
+            );
+
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
index fb21bfc..256acc4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -59,7 +59,7 @@
 
     kern_type kernel=a64_gemm_s8_4x4;
 
-    gemm_s8_4x4(const CPUInfo *ci) { UNUSED(ci); }
+    gemm_s8_4x4(const CPUInfo *) { }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp
index 2fc54f8..3b9a855 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
index 971b027..b862040 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -65,7 +65,7 @@
 
     kern_type kernel = a64_gemm_u16_asimd_12x8;
 
-    gemm_u16_12x8(const CPUInfo *ci) { UNUSED(ci); }
+    gemm_u16_12x8(const CPUInfo *) { }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp
index cd0de36..66f0b7c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
index a67e2d6..c0990ec 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,6 +32,7 @@
 // Load the actual kernel
 void a64_gemm_u8_12x8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
 void a64_gemm_u8_12x8_a55r1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void a64_gemm_u8_12x8_x1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
 
 class gemm_u8_12x8 {
 public:
@@ -73,6 +74,8 @@
 
         if (mod == CPUModel::A55r1) {
             kernel = a64_gemm_u8_12x8_a55r1;
+        } else if (mod == CPUModel::X1) {
+            kernel = a64_gemm_u8_12x8_x1;
         }
     }
 };
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp
index 35cb56b..c9a8a82 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp
index 2ffda7d..821e742 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp
new file mode 100644
index 0000000..7fac673
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp
@@ -0,0 +1,348 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm {
+
+void a64_gemm_u8_12x8_x1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+    const uint8_t *a_ptr = Apanel;
+    uint32_t *c_ptr = Cpanel;
+    // We divide K by 4 because the udot instruction processes 4 elements at a time.
+    const int W = K/4;
+    // Fix up for odd lengths - set a flag if K is odd, but make
+    // sure we round up the iteration count.
+    const int oddk = (W & 1);
+    const int init_value_k = ((W+1)/2) - 1;
+    for (int yb=0; yb<ablocks; yb++) {
+        const uint8_t *a_ptr0 = a_ptr;
+        const uint8_t *b_ptr = Bpanel;
+        for (int xb=0; xb<bblocks; xb++) {
+            a_ptr = a_ptr0;
+            int k = init_value_k;
+            register uint8x16_t a0  asm("v0");
+            register uint8x16_t a1  asm("v1");
+            register uint8x16_t b0  asm("v2");
+            register uint8x16_t b1  asm("v3");
+            register uint8x16_t b2  asm("v4");
+
+            __asm __volatile (
+                // Initialize result registers, load initial operands, prime prefetches.
+                "movi	v8.4s, #0x0\n"
+                "ldr	%q[a0], [%[a_ptr]]\n"
+                "movi	v9.4s, #0x0\n"
+                "ldr	%q[b0], [%[b_ptr]]\n"
+                "movi	v10.4s, #0x0\n"
+                "ldr	%q[a1], [%[a_ptr], #16]\n"
+                "movi	v11.4s, #0x0\n"
+                "ldr	%q[b1], [%[b_ptr], #16]\n"
+                "movi	v12.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi	v13.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi	v14.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi	v15.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #128]")
+                "movi	v16.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi	v17.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi	v18.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi	v19.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi	v20.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #256]")
+                "movi	v21.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #384]")
+                "movi	v22.4s, #0x0\n"
+                "movi	v23.4s, #0x0\n"
+                "movi	v24.4s, #0x0\n"
+                "movi	v25.4s, #0x0\n"
+                "movi	v26.4s, #0x0\n"
+                "movi	v27.4s, #0x0\n"
+                "movi	v28.4s, #0x0\n"
+                "movi	v29.4s, #0x0\n"
+                "movi	v30.4s, #0x0\n"
+                "movi	v31.4s, #0x0\n"
+
+                // Skip loop if we are doing zero iterations of it.
+                "cbz	%w[k], 4f\n"
+
+                // Loop proper
+                "1:\n"
+                ".word 0x6f80e048 // udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                ".word 0x6fa0e049 // udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "ldr	%q[b2], [%[b_ptr], #32]\n"
+                ".word 0x6f80e84a // udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                ".word 0x6fa0e84b // udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                ".word 0x6f81e04c // udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                ".word 0x6fa1e04d // udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                ".word 0x6f81e84e // udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                ".word 0x6fa1e84f // udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr	%q[b0], [%[b_ptr], #48]\n"
+
+                ".word 0x6f80e070 // udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                ".word 0x6fa0e071 // udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                ASM_PREFETCH("[%[a_ptr], #320]")
+                ".word 0x6f80e872 // udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                ".word 0x6fa0e873 // udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                ".word 0x6f81e074 // udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                ".word 0x6fa1e075 // udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                ".word 0x6f81e876 // udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                ".word 0x6fa1e877 // udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "ldr	%q[b1], [%[b_ptr], #64]\n"
+
+                ".word 0x6f80e098 // udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                ".word 0x6fa0e099 // udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                ASM_PREFETCH("[%[b_ptr], #448]")
+                ".word 0x6f80e89a // udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                ".word 0x6fa0e89b // udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "ldr	%q[a0], [%[a_ptr], #32]\n"
+                ".word 0x6f81e09c // udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                ".word 0x6fa1e09d // udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                ".word 0x6f81e89e // udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                ".word 0x6fa1e89f // udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "ldr	%q[a1], [%[a_ptr], #48]\n"
+
+                ".word 0x6f80e048 // udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                ".word 0x6fa0e049 // udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "ldr	%q[b2], [%[b_ptr], #80]\n"
+                ".word 0x6f80e84a // udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                ".word 0x6fa0e84b // udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                ".word 0x6f81e04c // udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                ".word 0x6fa1e04d // udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                ".word 0x6f81e84e // udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                ".word 0x6fa1e84f // udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr	%q[b0], [%[b_ptr], #96]\n"
+
+                ".word 0x6f80e070 // udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                ".word 0x6fa0e071 // udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                ASM_PREFETCH("[%[b_ptr], #512]")
+                ".word 0x6f80e872 // udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                ".word 0x6fa0e873 // udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "subs	%w[k], %w[k], #1\n"
+                ".word 0x6f81e074 // udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                ".word 0x6fa1e075 // udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                ".word 0x6f81e876 // udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                ".word 0x6fa1e877 // udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "ldr	%q[b1], [%[b_ptr], #112]\n"
+
+                ".word 0x6f80e098 // udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                ".word 0x6fa0e099 // udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                "add	%[a_ptr], %[a_ptr], #64\n"
+                ".word 0x6f80e89a // udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                ".word 0x6fa0e89b // udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "ldr	%q[a0], [%[a_ptr]]\n"
+                ".word 0x6f81e09c // udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
+                ".word 0x6fa1e09d // udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                ".word 0x6f81e89e // udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                ".word 0x6fa1e89f // udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "ldr	%q[a1], [%[a_ptr], #16]\n"
+                "bne	1b\n"
+
+                // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
+                "4:\n"
+
+                // Branch to alternative tail for odd K
+                "cbnz	%w[oddk], 2f\n"
+
+                // Detached final iteration (even K)
+                ".word 0x6f80e048 // udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                ".word 0x6fa0e049 // udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "ldr	%q[b2], [%[b_ptr], #32]\n"
+                ".word 0x6f80e84a // udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                ".word 0x6fa0e84b // udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                ".word 0x6f81e04c // udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                ".word 0x6fa1e04d // udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                ".word 0x6f81e84e // udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                ".word 0x6fa1e84f // udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "ldr	%q[b0], [%[b_ptr], #48]\n"
+
+                ".word 0x6f80e070 // udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                ".word 0x6fa0e071 // udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                ".word 0x6f80e872 // udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                ".word 0x6fa0e873 // udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                ".word 0x6f81e074 // udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                ".word 0x6fa1e075 // udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                ".word 0x6f81e876 // udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                ".word 0x6fa1e877 // udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "ldr	%q[b1], [%[b_ptr], #64]\n"
+
+                ".word 0x6f80e098 // udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                ".word 0x6fa0e099 // udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                "add	%[a_ptr], %[a_ptr], #64\n"
+                ".word 0x6f80e89a // udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                ".word 0x6fa0e89b // udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "ldr	%q[a0], [%[a_ptr], #-32]\n"
+                ".word 0x6f81e09c // udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                ".word 0x6fa1e09d // udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                ".word 0x6f81e89e // udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                ".word 0x6fa1e89f // udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "ldr	%q[a1], [%[a_ptr], #-16]\n"
+
+                ".word 0x6f80e048 // udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "ldr	%q[b2], [%[b_ptr], #80]\n"
+
+                ".word 0x6f80e070 // udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
+                ".word 0x6fa0e049 // udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "str	q8, [%[c_ptr], #0]\n"
+                ".word 0x6fa0e071 // udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                "str	q16, [%[c_ptr], #16]\n"
+                ".word 0x6f80e098 // udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "str	q24, [%[c_ptr], #32]\n"
+
+                ".word 0x6fa0e099 // udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                "str	q9, [%[c_ptr], #48]\n"
+                ".word 0x6f80e84a // udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "str	q17, [%[c_ptr], #64]\n"
+                ".word 0x6f80e872 // udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "str	q25, [%[c_ptr], #80]\n"
+                ".word 0x6f80e89a // udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "str	q10, [%[c_ptr], #96]\n"
+
+                ".word 0x6fa0e84b // udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "str	q18, [%[c_ptr], #112]\n"
+                ".word 0x6fa0e873 // udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "str	q26, [%[c_ptr], #128]\n"
+                ".word 0x6fa0e89b // udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "str	q11, [%[c_ptr], #144]\n"
+
+                ".word 0x6f81e04c // udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "str	q19, [%[c_ptr], #160]\n"
+                ".word 0x6f81e074 // udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "str	q27, [%[c_ptr], #176]\n"
+                ".word 0x6f81e09c // udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "str	q12, [%[c_ptr], #192]\n"
+
+                ".word 0x6fa1e04d // udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                "str	q20, [%[c_ptr], #208]\n"
+                ".word 0x6fa1e075 // udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "str	q28, [%[c_ptr], #224]\n"
+                ".word 0x6fa1e09d // udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                "str	q13, [%[c_ptr], #240]\n"
+
+                ".word 0x6f81e84e // udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "str	q21, [%[c_ptr], #256]\n"
+                ".word 0x6f81e876 // udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "str	q29, [%[c_ptr], #272]\n"
+                ".word 0x6f81e89e // udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                "str	q14, [%[c_ptr], #288]\n"
+
+                ".word 0x6fa1e84f // udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "str	q22, [%[c_ptr], #304]\n"
+                ".word 0x6fa1e877 // udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "str	q30, [%[c_ptr], #320]\n"
+                ".word 0x6fa1e89f // udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "str	q15, [%[c_ptr], #336]\n"
+
+                "b	3f\n"
+
+                // Detached final iteration (odd K)
+                "2:\n"
+                ".word 0x6f80e048 // udot v8.4s , %[b0].16b, %[a0].4b[0]\n"
+                "ldr	%q[b2], [%[b_ptr], #32]\n"
+                ".word 0x6f80e070 // udot v16.4s, %[b1].16b, %[a0].4b[0]\n"
+                ".word 0x6fa0e049 // udot v9.4s , %[b0].16b, %[a0].4b[1]\n"
+                "str	q8, [%[c_ptr], #0]\n"
+                ".word 0x6fa0e071 // udot v17.4s, %[b1].16b, %[a0].4b[1]\n"
+                "str	q16, [%[c_ptr], #16]\n"
+                ".word 0x6f80e098 // udot v24.4s, %[b2].16b, %[a0].4b[0]\n"
+                "add	%[b_ptr], %[b_ptr], #48\n"
+                "add	%[a_ptr], %[a_ptr], #32\n"
+                "str	q24, [%[c_ptr], #32]\n"
+                ".word 0x6fa0e099 // udot v25.4s, %[b2].16b, %[a0].4b[1]\n"
+                "str	q9, [%[c_ptr], #48]\n"
+
+                ".word 0x6f80e84a // udot v10.4s, %[b0].16b, %[a0].4b[2]\n"
+                "str	q17, [%[c_ptr], #64]\n"
+                ".word 0x6f80e872 // udot v18.4s, %[b1].16b, %[a0].4b[2]\n"
+                "str	q25, [%[c_ptr], #80]\n"
+                ".word 0x6f80e89a // udot v26.4s, %[b2].16b, %[a0].4b[2]\n"
+                "str	q10, [%[c_ptr], #96]\n"
+
+                ".word 0x6fa0e84b // udot v11.4s, %[b0].16b, %[a0].4b[3]\n"
+                "str	q18, [%[c_ptr], #112]\n"
+                ".word 0x6fa0e873 // udot v19.4s, %[b1].16b, %[a0].4b[3]\n"
+                "str	q26, [%[c_ptr], #128]\n"
+                ".word 0x6fa0e89b // udot v27.4s, %[b2].16b, %[a0].4b[3]\n"
+                "str	q11, [%[c_ptr], #144]\n"
+
+                ".word 0x6f81e04c // udot v12.4s, %[b0].16b, %[a1].4b[0]\n"
+                "str	q19, [%[c_ptr], #160]\n"
+                ".word 0x6f81e074 // udot v20.4s, %[b1].16b, %[a1].4b[0]\n"
+                "str	q27, [%[c_ptr], #176]\n"
+                ".word 0x6f81e09c // udot v28.4s, %[b2].16b, %[a1].4b[0]\n"
+                "str	q12, [%[c_ptr], #192]\n"
+
+                ".word 0x6fa1e04d // udot v13.4s, %[b0].16b, %[a1].4b[1]\n"
+                "str	q20, [%[c_ptr], #208]\n"
+                ".word 0x6fa1e075 // udot v21.4s, %[b1].16b, %[a1].4b[1]\n"
+                "str	q28, [%[c_ptr], #224]\n"
+                ".word 0x6fa1e09d // udot v29.4s, %[b2].16b, %[a1].4b[1]\n"
+                "str	q13, [%[c_ptr], #240]\n"
+
+                ".word 0x6f81e84e // udot v14.4s, %[b0].16b, %[a1].4b[2]\n"
+                "str	q21, [%[c_ptr], #256]\n"
+                ".word 0x6f81e876 // udot v22.4s, %[b1].16b, %[a1].4b[2]\n"
+                "str	q29, [%[c_ptr], #272]\n"
+                ".word 0x6f81e89e // udot v30.4s, %[b2].16b, %[a1].4b[2]\n"
+                "str	q14, [%[c_ptr], #288]\n"
+
+                ".word 0x6fa1e84f // udot v15.4s, %[b0].16b, %[a1].4b[3]\n"
+                "str	q22, [%[c_ptr], #304]\n"
+                ".word 0x6fa1e877 // udot v23.4s, %[b1].16b, %[a1].4b[3]\n"
+                "str	q30, [%[c_ptr], #320]\n"
+                ".word 0x6fa1e89f // udot v31.4s, %[b2].16b, %[a1].4b[3]\n"
+                "str	q15, [%[c_ptr], #336]\n"
+
+
+                // Common tail
+                "3:\n"
+                "str	q23, [%[c_ptr], #352]\n"
+                "str	q31, [%[c_ptr], #368]\n"
+                "add	%[c_ptr], %[c_ptr], #384\n"
+
+            :
+              [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+              [a0] "+w" (a0), [a1] "+w" (a1),
+              [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+            : [oddk] "r" (oddk)
+            : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+              "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
+            );
+
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
index 8bde3a6..134007b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -67,9 +67,7 @@
 
     kern_type kernel = a64_gemm_u8_4x4;
 
-    gemm_u8_4x4(const CPUInfo *ci) {
-        UNUSED(ci);
-    }
+    gemm_u8_4x4(const CPUInfo *) { }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp
index 2e60833..073aeab 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
index 8e17aa6..79cae60 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,6 +32,7 @@
 // Actual kernel implementations
 void a64_hgemm_asimd_24x8(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
 void a64_hgemm_asimd_24x8_a55r1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+void a64_hgemm_asimd_24x8_x1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
 
 // 24x8 HGEMM "strategy" class.  Describes the kernel properties.
 //
@@ -68,6 +69,8 @@
 
         if (model == CPUModel::A55r1) {
             kernel = a64_hgemm_asimd_24x8_a55r1;
+        } else if (model == CPUModel::X1) {
+            kernel = a64_hgemm_asimd_24x8_x1;
         }
     }
 };
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp
index a3839ce..829ae30 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
index f32a627..657fade 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/x1.cpp
new file mode 100644
index 0000000..3bb8334
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/x1.cpp
@@ -0,0 +1,350 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// Build on AArch64 where either FP16_KERNELS is set or FP16 is explicitly supported.
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 24xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 24x8), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
+
+namespace arm_gemm {
+
+void a64_hgemm_asimd_24x8_x1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
+    const __fp16 *a_ptr = Apanel;
+    __fp16 *c_ptr = Cpanel;
+
+    for (int yb=0; yb<ablocks; yb++) {
+        const __fp16 *a_ptr0 = a_ptr;
+        const __fp16 *b_ptr = Bpanel;
+
+        for (int xb=0; xb<bblocks; xb++) {
+            a_ptr = a_ptr0;
+            // Fix up for odd lengths - set a flag if K is odd, but make
+            // sure we round up the iteration count.
+            int oddk = (K & 1);
+            int k = ((K+1)/2) - 1;
+
+            register float16x8_t a0  asm("v0");
+            register float16x8_t a0a asm("v1");
+            register float16x8_t b0  asm("v2");
+            register float16x8_t b1  asm("v3");
+            register float16x8_t b2  asm("v4");
+
+            __asm __volatile (
+                // Enable FP16 instruction support (but only if it's not already on).
+#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                ".arch	armv8.2-a+fp16\n"
+#endif
+                // Initialize result registers, load initial operands, prime prefetches.
+                "movi	v8.8h, #0x0\n"
+                "ldr	%q[a0], [%[a_ptr]]\n"
+                "movi	v9.8h, #0x0\n"
+                "ldr	%q[b0], [%[b_ptr]]\n"
+                "movi	v10.8h, #0x0\n"
+                "ldr	%q[b1], [%[b_ptr], #16]\n"
+                "movi	v11.8h, #0x0\n"
+                "movi	v12.8h, #0x0\n"
+                "movi	v13.8h, #0x0\n"
+                "movi	v14.8h, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi	v15.8h, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi	v16.8h, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi	v17.8h, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi	v18.8h, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi	v19.8h, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi	v20.8h, #0x0\n"
+                "movi	v21.8h, #0x0\n"
+                "movi	v22.8h, #0x0\n"
+                "movi	v23.8h, #0x0\n"
+                "movi	v24.8h, #0x0\n"
+                "movi	v25.8h, #0x0\n"
+                "movi	v26.8h, #0x0\n"
+                "movi	v27.8h, #0x0\n"
+                "movi	v28.8h, #0x0\n"
+                "movi	v29.8h, #0x0\n"
+                "movi	v30.8h, #0x0\n"
+                "movi	v31.8h, #0x0\n"
+
+                // Skip loop if we are doing zero iterations of it.
+                "cbz	%w[k], 4f\n"
+
+                "1:\n"
+                "fmla 	v8.8h , %[b0].8h, %[a0].h[0]\n"
+                "fmla  	v9.8h , %[b0].8h, %[a0].h[1]\n"
+                "fmla	v10.8h, %[b0].8h, %[a0].h[2]\n"
+                "fmla	v11.8h, %[b0].8h, %[a0].h[3]\n"
+                "ldr	%q[b2], [%[b_ptr], #32]\n"
+                "fmla 	v12.8h, %[b0].8h, %[a0].h[4]\n"
+                "fmla	v13.8h, %[b0].8h, %[a0].h[5]\n"
+                "fmla	v14.8h, %[b0].8h, %[a0].h[6]\n"
+                "fmla	v15.8h, %[b0].8h, %[a0].h[7]\n"
+                "ldr	%q[b0], [%[b_ptr], #48]\n"
+
+                "fmla	v16.8h, %[b1].8h, %[a0].h[0]\n"
+                "fmla	v17.8h, %[b1].8h, %[a0].h[1]\n"
+                ASM_PREFETCH("[%[a_ptr], #128]")
+                "fmla	v18.8h, %[b1].8h, %[a0].h[2]\n"
+                "fmla	v19.8h, %[b1].8h, %[a0].h[3]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
+                "fmla	v20.8h, %[b1].8h, %[a0].h[4]\n"
+                "fmla	v21.8h, %[b1].8h, %[a0].h[5]\n"
+                "fmla	v22.8h, %[b1].8h, %[a0].h[6]\n"
+                "fmla	v23.8h, %[b1].8h, %[a0].h[7]\n"
+                "ldr	%q[b1], [%[b_ptr], #-32]\n"
+
+                "fmla	v24.8h, %[b2].8h, %[a0].h[0]\n"
+                "fmla	v25.8h, %[b2].8h, %[a0].h[1]\n"
+                ASM_PREFETCH("[%[b_ptr], #288]")
+                "fmla	v26.8h, %[b2].8h, %[a0].h[2]\n"
+                "fmla	v27.8h, %[b2].8h, %[a0].h[3]\n"
+                "ldr	%q[a0a], [%[a_ptr], #16]\n"
+                "fmla	v28.8h, %[b2].8h, %[a0].h[4]\n"
+                "fmla	v29.8h, %[b2].8h, %[a0].h[5]\n"
+                "fmla	v30.8h, %[b2].8h, %[a0].h[6]\n"
+                "fmla	v31.8h, %[b2].8h, %[a0].h[7]\n"
+                "ldr	%q[b2], [%[b_ptr], #-16]\n"
+
+                "fmla 	v8.8h , %[b0].8h, %[a0a].h[0]\n"
+                "fmla	v9.8h , %[b0].8h, %[a0a].h[1]\n"
+                "fmla	v10.8h, %[b0].8h, %[a0a].h[2]\n"
+                "fmla	v11.8h, %[b0].8h, %[a0a].h[3]\n"
+                "fmla 	v12.8h, %[b0].8h, %[a0a].h[4]\n"
+                "fmla	v13.8h, %[b0].8h, %[a0a].h[5]\n"
+                "fmla	v14.8h, %[b0].8h, %[a0a].h[6]\n"
+                "fmla	v15.8h, %[b0].8h, %[a0a].h[7]\n"
+                "ldr	%q[b0], [%[b_ptr]]\n"
+
+                "fmla	v16.8h, %[b1].8h, %[a0a].h[0]\n"
+                "fmla	v17.8h, %[b1].8h, %[a0a].h[1]\n"
+                ASM_PREFETCH("[%[b_ptr], #352]")
+                "fmla	v18.8h, %[b1].8h, %[a0a].h[2]\n"
+                "fmla	v19.8h, %[b1].8h, %[a0a].h[3]\n"
+                "fmla	v20.8h, %[b1].8h, %[a0a].h[4]\n"
+                "fmla	v21.8h, %[b1].8h, %[a0a].h[5]\n"
+                "fmla	v22.8h, %[b1].8h, %[a0a].h[6]\n"
+                "fmla	v23.8h, %[b1].8h, %[a0a].h[7]\n"
+                "ldr	%q[b1], [%[b_ptr], #16]\n"
+
+                "fmla	v24.8h, %[b2].8h, %[a0a].h[0]\n"
+                "fmla	v25.8h, %[b2].8h, %[a0a].h[1]\n"
+                "add	%[a_ptr], %[a_ptr], #32\n"
+                "fmla	v26.8h, %[b2].8h, %[a0a].h[2]\n"
+                "fmla	v27.8h, %[b2].8h, %[a0a].h[3]\n"
+                "ldr	%q[a0], [%[a_ptr]]\n"
+                "fmla	v28.8h, %[b2].8h, %[a0a].h[4]\n"
+                "fmla	v29.8h, %[b2].8h, %[a0a].h[5]\n"
+                "subs	%w[k], %w[k], #1\n"
+                "fmla	v30.8h, %[b2].8h, %[a0a].h[6]\n"
+                "fmla	v31.8h, %[b2].8h, %[a0a].h[7]\n"
+
+                "bne	1b\n"
+                "4:\n"
+
+                // Jump to odd tail if necessary.
+                "cbnz	%w[oddk], 2f\n"
+
+                // Even tail.
+                "fmla 	v8.8h , %[b0].8h, %[a0].h[0]\n"
+                "fmla   v9.8h , %[b0].8h, %[a0].h[1]\n"
+                "fmla	v10.8h, %[b0].8h, %[a0].h[2]\n"
+                "fmla	v11.8h, %[b0].8h, %[a0].h[3]\n"
+                "ldr	%q[b2], [%[b_ptr], #32]\n"
+                "fmla 	v12.8h, %[b0].8h, %[a0].h[4]\n"
+                "fmla   v13.8h, %[b0].8h, %[a0].h[5]\n"
+                "fmla	v14.8h, %[b0].8h, %[a0].h[6]\n"
+                "fmla	v15.8h, %[b0].8h, %[a0].h[7]\n"
+                "ldr	%q[b0], [%[b_ptr], #48]\n"
+
+                "fmla	v16.8h, %[b1].8h, %[a0].h[0]\n"
+                "fmla	v17.8h, %[b1].8h, %[a0].h[1]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
+                "fmla	v18.8h, %[b1].8h, %[a0].h[2]\n"
+                "fmla	v19.8h, %[b1].8h, %[a0].h[3]\n"
+                "fmla	v20.8h, %[b1].8h, %[a0].h[4]\n"
+                "fmla	v21.8h, %[b1].8h, %[a0].h[5]\n"
+                "add	%[a_ptr], %[a_ptr], #32\n"
+                "fmla	v22.8h, %[b1].8h, %[a0].h[6]\n"
+                "fmla	v23.8h, %[b1].8h, %[a0].h[7]\n"
+                "ldr	%q[b1], [%[b_ptr], #-32]\n"
+
+                "fmla	v24.8h, %[b2].8h, %[a0].h[0]\n"
+                "fmla	v25.8h, %[b2].8h, %[a0].h[1]\n"
+                "fmla	v26.8h, %[b2].8h, %[a0].h[2]\n"
+                "fmla	v27.8h, %[b2].8h, %[a0].h[3]\n"
+                "ldr	%q[a0a], [%[a_ptr], #-16]\n"
+                "fmla	v28.8h, %[b2].8h, %[a0].h[4]\n"
+                "fmla	v29.8h, %[b2].8h, %[a0].h[5]\n"
+                "fmla	v30.8h, %[b2].8h, %[a0].h[6]\n"
+                "fmla	v31.8h, %[b2].8h, %[a0].h[7]\n"
+                "ldr	%q[b2], [%[b_ptr], #-16]\n"
+
+                "fmla 	v8.8h , %[b0].8h, %[a0a].h[0]\n"
+                "fmla	v16.8h, %[b1].8h, %[a0a].h[0]\n"
+                "str	q8, [%[c_ptr]]\n"
+                "fmla	v24.8h, %[b2].8h, %[a0a].h[0]\n"
+                "str	q16, [%[c_ptr], #16]\n"
+
+                "fmla  	v9.8h , %[b0].8h, %[a0a].h[1]\n"
+                "str	q24, [%[c_ptr], #32]\n"
+                "fmla	v17.8h, %[b1].8h, %[a0a].h[1]\n"
+                "str	q9, [%[c_ptr], #48]\n"
+                "fmla	v25.8h, %[b2].8h, %[a0a].h[1]\n"
+                "str	q17, [%[c_ptr], #64]\n"
+
+                "fmla	v10.8h, %[b0].8h, %[a0a].h[2]\n"
+                "str	q25, [%[c_ptr], #80]\n"
+                "fmla	v18.8h, %[b1].8h, %[a0a].h[2]\n"
+                "str	q10, [%[c_ptr], #96]\n"
+                "fmla	v26.8h, %[b2].8h, %[a0a].h[2]\n"
+                "str	q18, [%[c_ptr], #112]\n"
+
+                "fmla	v11.8h, %[b0].8h, %[a0a].h[3]\n"
+                "str	q26, [%[c_ptr], #128]\n"
+                "fmla	v19.8h, %[b1].8h, %[a0a].h[3]\n"
+                "str	q11, [%[c_ptr], #144]\n"
+                "fmla	v27.8h, %[b2].8h, %[a0a].h[3]\n"
+                "str	q19, [%[c_ptr], #160]\n"
+
+                "fmla 	v12.8h, %[b0].8h, %[a0a].h[4]\n"
+                "str	q27, [%[c_ptr], #176]\n"
+                "fmla	v20.8h, %[b1].8h, %[a0a].h[4]\n"
+                "str	q12, [%[c_ptr], #192]\n"
+                "fmla	v28.8h, %[b2].8h, %[a0a].h[4]\n"
+                "str	q20, [%[c_ptr], #208]\n"
+
+                "fmla  	v13.8h, %[b0].8h, %[a0a].h[5]\n"
+                "str	q28, [%[c_ptr], #224]\n"
+                "fmla	v21.8h, %[b1].8h, %[a0a].h[5]\n"
+                "str	q13, [%[c_ptr], #240]\n"
+                "fmla	v29.8h, %[b2].8h, %[a0a].h[5]\n"
+                "str	q21, [%[c_ptr], #256]\n"
+
+                "fmla	v14.8h, %[b0].8h, %[a0a].h[6]\n"
+                "str	q29, [%[c_ptr], #272]\n"
+                "fmla	v22.8h, %[b1].8h, %[a0a].h[6]\n"
+                "str	q14, [%[c_ptr], #288]\n"
+                "fmla	v30.8h, %[b2].8h, %[a0a].h[6]\n"
+                "str	q22, [%[c_ptr], #304]\n"
+
+                "fmla	v15.8h, %[b0].8h, %[a0a].h[7]\n"
+                "str	q30, [%[c_ptr], #320]\n"
+                "fmla	v23.8h, %[b1].8h, %[a0a].h[7]\n"
+                "str	q15, [%[c_ptr], #336]\n"
+                "fmla	v31.8h, %[b2].8h, %[a0a].h[7]\n"
+                "b	3f\n"
+
+                // Odd tail
+                "2:\n"
+                "ldr	%q[b2], [%[b_ptr], #32]\n"
+                "fmla 	v8.8h , %[b0].8h, %[a0].h[0]\n"
+                "add	%[b_ptr], %[b_ptr], #48\n"
+                "fmla	v16.8h, %[b1].8h, %[a0].h[0]\n"
+                "add	%[a_ptr], %[a_ptr], #16\n"
+                "str	q8, [%[c_ptr]]\n"
+                "fmla	v24.8h, %[b2].8h, %[a0].h[0]\n"
+                "str	q16, [%[c_ptr], #16]\n"
+
+                "fmla  	v9.8h , %[b0].8h, %[a0].h[1]\n"
+                "str	q24, [%[c_ptr], #32]\n"
+                "fmla	v17.8h, %[b1].8h, %[a0].h[1]\n"
+                "str	q9, [%[c_ptr], #48]\n"
+                "fmla	v25.8h, %[b2].8h, %[a0].h[1]\n"
+                "str	q17, [%[c_ptr], #64]\n"
+
+                "fmla	v10.8h, %[b0].8h, %[a0].h[2]\n"
+                "str	q25, [%[c_ptr], #80]\n"
+                "fmla	v18.8h, %[b1].8h, %[a0].h[2]\n"
+                "str	q10, [%[c_ptr], #96]\n"
+                "fmla	v26.8h, %[b2].8h, %[a0].h[2]\n"
+                "str	q18, [%[c_ptr], #112]\n"
+
+                "fmla	v11.8h, %[b0].8h, %[a0].h[3]\n"
+                "str	q26, [%[c_ptr], #128]\n"
+                "fmla	v19.8h, %[b1].8h, %[a0].h[3]\n"
+                "str	q11, [%[c_ptr], #144]\n"
+                "fmla	v27.8h, %[b2].8h, %[a0].h[3]\n"
+                "str	q19, [%[c_ptr], #160]\n"
+
+                "fmla 	v12.8h, %[b0].8h, %[a0].h[4]\n"
+                "str	q27, [%[c_ptr], #176]\n"
+                "fmla	v20.8h, %[b1].8h, %[a0].h[4]\n"
+                "str	q12, [%[c_ptr], #192]\n"
+                "fmla	v28.8h, %[b2].8h, %[a0].h[4]\n"
+                "str	q20, [%[c_ptr], #208]\n"
+
+                "fmla  	v13.8h, %[b0].8h, %[a0].h[5]\n"
+                "str	q28, [%[c_ptr], #224]\n"
+                "fmla	v21.8h, %[b1].8h, %[a0].h[5]\n"
+                "str	q13, [%[c_ptr], #240]\n"
+                "fmla	v29.8h, %[b2].8h, %[a0].h[5]\n"
+                "str	q21, [%[c_ptr], #256]\n"
+
+                "fmla	v14.8h, %[b0].8h, %[a0].h[6]\n"
+                "str	q29, [%[c_ptr], #272]\n"
+                "fmla	v22.8h, %[b1].8h, %[a0].h[6]\n"
+                "str	q14, [%[c_ptr], #288]\n"
+                "fmla	v30.8h, %[b2].8h, %[a0].h[6]\n"
+                "str	q22, [%[c_ptr], #304]\n"
+
+                "fmla	v15.8h, %[b0].8h, %[a0].h[7]\n"
+                "str	q30, [%[c_ptr], #320]\n"
+                "fmla	v23.8h, %[b1].8h, %[a0].h[7]\n"
+                "str	q15, [%[c_ptr], #336]\n"
+                "fmla	v31.8h, %[b2].8h, %[a0].h[7]\n"
+
+                "3:\n"
+                "str	q23, [%[c_ptr], #352]\n"
+                "str	q31, [%[c_ptr], #368]\n"
+                "add	%[c_ptr], %[c_ptr], #384\n"
+            :
+              [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+              [a0] "+w" (a0), [a0a] "+w" (a0a),
+              [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+            : [oddk] "r" (oddk)
+            : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+              "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
+            );
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__ && (FP16_KERNELS || __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp
index 1ce934d..4147ab6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,7 +25,7 @@
 
 #ifdef __aarch64__
 
-
+#include "../performance_parameters.hpp"
 #include "../std_transforms_fixed.hpp"
 
 namespace arm_gemm
@@ -34,6 +34,7 @@
 // Actual kernel implementations
 void a64_hybrid_fp32_mla_16x4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
 void a64_hybrid_fp32_mla_16x4_a55(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+void a64_hybrid_fp32_mla_16x4_x1(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
 
 class hybrid_fp32_mla_16x4
 {
@@ -59,7 +60,7 @@
         return 1;
     }
 
-    static constexpr bool supports_append()
+    static constexpr bool supports_accumulate()
     {
         return true;
     }
@@ -74,6 +75,22 @@
         return true;
     }
 
+    static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
+        switch (ci->get_cpu_model()) {
+            case CPUModel::A55r1:
+                return { 2.866 };
+
+            case CPUModel::A53:
+                return { 1.419 };
+
+            case CPUModel::A73:
+                return { 2.551 };
+
+            default:
+                return { 6.25 };
+        }
+    }
+
     StdTransformsFixed<operand_type, result_type, 4, 16, 1> transforms = {};
 
     // Default to the generic kernel
@@ -83,6 +100,8 @@
     {
         if (ci->get_cpu_model() == CPUModel::A55r1) {
             kernel = a64_hybrid_fp32_mla_16x4_a55;
+        } else if (ci->get_cpu_model() == CPUModel::X1) {
+            kernel = a64_hybrid_fp32_mla_16x4_x1;
         }
     }
 };
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp
index 5bce632..94fcd10 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 
 namespace arm_gemm {
 
-void a64_hybrid_fp32_mla_16x4_a55(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool append) {
+void a64_hybrid_fp32_mla_16x4_a55(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
     const int K_stride = K;
     const long loops_count = ((K + 4) / 8) - 1;
     K -= loops_count * 8;
@@ -40,7 +40,7 @@
     K -= (regs_count + 1) * 4;
     const long blocks_count = K / 1;
     float nullbias[16];
-    if (!append && !bias) {
+    if (!accumulate && !bias) {
         memset(nullbias, 0, (16 * sizeof(float)));
     }
     float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
@@ -61,12 +61,23 @@
             break;
     }
 
-    for (int y=0; y<M; y+=4) {
+    int rows_to_compute;
+
+    for (int y=0; y<M; y+=rows_to_compute) {
         const float * const a_ptr0_base = A + (y * lda);
         const unsigned long ldab = lda * sizeof(float);
 
         float *c_ptr0 = C + (y * ldc);
 
+        rows_to_compute = M-y;
+        if (rows_to_compute > 4) {
+            if (rows_to_compute % 4) {
+                rows_to_compute = 4 - 1;
+            } else {
+                rows_to_compute = 4;
+            }
+        }
+
         for (int x0=0; x0<N; x0+=16ul) {
             const long width = std::min((unsigned long)N-x0, 16ul);
             long loops = loops_count;
@@ -78,7 +89,7 @@
             float result_buffer[64];
             const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(float);
             float *c_ptr_real = c_ptr0;
-            if (use_result_buffer && append) {
+            if (use_result_buffer && accumulate) {
                 for(int cy=0; cy<std::min(M-y, 4); cy++) {
                     for(unsigned int cx=0; cx<width; cx++) {
                         result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
@@ -90,14 +101,14 @@
             }
             const float *biasptr = bias ? bias+x0 : nullbias;
 
-            switch(M-y) {
+            switch(rows_to_compute) {
                 case 1:
                     __asm __volatile (
                         "temploadreg0 .req X0\n"
                         "temploadreg1 .req X1\n"
                         "temploadreg2 .req X2\n"
                         "temploadreg3 .req X3\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "ldr q16, [%[biasptr]]\n"
                         "ldr q17, [%[biasptr], #0x10]\n"
                         "ldr q18, [%[biasptr], #0x20]\n"
@@ -470,7 +481,7 @@
                         ".unreq temploadreg2\n"
                         ".unreq temploadreg3\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
                     );
                     break;
@@ -484,7 +495,7 @@
                         "temploadreg3 .req X5\n"
                         "add a_ptr1, %[a_ptr0], %[lda]\n"
                         "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "ldr q16, [%[biasptr]]\n"
                         "ldr q17, [%[biasptr], #0x10]\n"
                         "ldr q18, [%[biasptr], #0x20]\n"
@@ -982,7 +993,7 @@
                         ".unreq temploadreg2\n"
                         ".unreq temploadreg3\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                     );
                     break;
@@ -1000,7 +1011,7 @@
                         "add c_ptr1, %[c_ptr0], %[ldc]\n"
                         "add a_ptr2, a_ptr1, %[lda]\n"
                         "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "ldr q16, [%[biasptr]]\n"
                         "ldr q17, [%[biasptr], #0x10]\n"
                         "ldr q18, [%[biasptr], #0x20]\n"
@@ -1623,7 +1634,7 @@
                         ".unreq temploadreg2\n"
                         ".unreq temploadreg3\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
                     );
                     break;
@@ -1646,7 +1657,7 @@
                         "add c_ptr2, c_ptr1, %[ldc]\n"
                         "add a_ptr3, a_ptr2, %[lda]\n"
                         "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "ldr q16, [%[biasptr]]\n"
                         "ldr q17, [%[biasptr], #0x10]\n"
                         "ldr q18, [%[biasptr], #0x20]\n"
@@ -2395,7 +2406,7 @@
                         ".unreq temploadreg2\n"
                         ".unreq temploadreg3\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
                     );
                     break;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp
index 03f6588..016bef4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 
 namespace arm_gemm {
 
-void a64_hybrid_fp32_mla_16x4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool append) {
+void a64_hybrid_fp32_mla_16x4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
     const int K_stride = K;
     const long loops_count = ((K + 4) / 8) - 1;
     K -= loops_count * 8;
@@ -40,7 +40,7 @@
     K -= (regs_count + 1) * 4;
     const long blocks_count = K / 1;
     float nullbias[16];
-    if (!append && !bias) {
+    if (!accumulate && !bias) {
         memset(nullbias, 0, (16 * sizeof(float)));
     }
     float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
@@ -61,12 +61,23 @@
             break;
     }
 
-    for (int y=0; y<M; y+=4) {
+    int rows_to_compute;
+
+    for (int y=0; y<M; y+=rows_to_compute) {
         const float * const a_ptr0_base = A + (y * lda);
         const unsigned long ldab = lda * sizeof(float);
 
         float *c_ptr0 = C + (y * ldc);
 
+        rows_to_compute = M-y;
+        if (rows_to_compute > 4) {
+            if (rows_to_compute % 4) {
+                rows_to_compute = 4 - 1;
+            } else {
+                rows_to_compute = 4;
+            }
+        }
+
         for (int x0=0; x0<N; x0+=16ul) {
             const long width = std::min((unsigned long)N-x0, 16ul);
             long loops = loops_count;
@@ -78,7 +89,7 @@
             float result_buffer[64];
             const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(float);
             float *c_ptr_real = c_ptr0;
-            if (use_result_buffer && append) {
+            if (use_result_buffer && accumulate) {
                 for(int cy=0; cy<std::min(M-y, 4); cy++) {
                     for(unsigned int cx=0; cx<width; cx++) {
                         result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
@@ -90,10 +101,10 @@
             }
             const float *biasptr = bias ? bias+x0 : nullbias;
 
-            switch(M-y) {
+            switch(rows_to_compute) {
                 case 1:
                     __asm __volatile (
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "ldr q16, [%[biasptr]]\n"
                         "ldr q17, [%[biasptr], #0x10]\n"
                         "ldr q18, [%[biasptr], #0x20]\n"
@@ -323,7 +334,7 @@
                         "str q19, [%[c_ptr0], #0x30]\n"
                         "add %[c_ptr0], %[c_ptr0], #0x40\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
                     );
                     break;
@@ -333,7 +344,7 @@
                         "c_ptr1 .req X1\n"
                         "add a_ptr1, %[a_ptr0], %[lda]\n"
                         "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "ldr q16, [%[biasptr]]\n"
                         "ldr q17, [%[biasptr], #0x10]\n"
                         "ldr q18, [%[biasptr], #0x20]\n"
@@ -682,7 +693,7 @@
                         ".unreq a_ptr1\n"
                         ".unreq c_ptr1\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
                     );
                     break;
@@ -696,7 +707,7 @@
                         "add c_ptr1, %[c_ptr0], %[ldc]\n"
                         "add a_ptr2, a_ptr1, %[lda]\n"
                         "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "ldr q16, [%[biasptr]]\n"
                         "ldr q17, [%[biasptr], #0x10]\n"
                         "ldr q18, [%[biasptr], #0x20]\n"
@@ -1164,7 +1175,7 @@
                         ".unreq c_ptr1\n"
                         ".unreq c_ptr2\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
                     );
                     break;
@@ -1183,7 +1194,7 @@
                         "add c_ptr2, c_ptr1, %[ldc]\n"
                         "add a_ptr3, a_ptr2, %[lda]\n"
                         "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "ldr q16, [%[biasptr]]\n"
                         "ldr q17, [%[biasptr], #0x10]\n"
                         "ldr q18, [%[biasptr], #0x20]\n"
@@ -1770,7 +1781,7 @@
                         ".unreq c_ptr2\n"
                         ".unreq c_ptr3\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                     );
                     break;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp
new file mode 100644
index 0000000..3f1df76
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp
@@ -0,0 +1,1810 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <algorithm>
+
+#include "arm_gemm.hpp"
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void a64_hybrid_fp32_mla_16x4_x1(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
+    const int K_stride = K;
+    const long loops_count = ((K + 4) / 8) - 1;
+    K -= loops_count * 8;
+    const long regs_count = (K / 4) - 1;
+    K -= (regs_count + 1) * 4;
+    const long blocks_count = K / 1;
+    float nullbias[16];
+    if (!accumulate && !bias) {
+        memset(nullbias, 0, (16 * sizeof(float)));
+    }
+    float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+    float maxval =   static_cast<float>(std::numeric_limits<float>::infinity());
+    const float * const minptr = &minval;
+    const float * const maxptr = &maxval;
+
+    switch(act.type)
+    {
+        default:
+        case Activation::Type::None:
+            break;
+        case Activation::Type::BoundedReLU:
+            maxval = static_cast<float>(act.param1);
+            /* fall through */
+        case Activation::Type::ReLU:
+            minval = 0.0f;
+            break;
+    }
+
+    int rows_to_compute;
+
+    for (int y=0; y<M; y+=rows_to_compute) {
+        const float * const a_ptr0_base = A + (y * lda);
+        const unsigned long ldab = lda * sizeof(float);
+
+        float *c_ptr0 = C + (y * ldc);
+
+        rows_to_compute = M-y;
+        if (rows_to_compute > 4) {
+            if (rows_to_compute % 4) {
+                rows_to_compute = 4 - 1;
+            } else {
+                rows_to_compute = 4;
+            }
+        }
+
+        for (int x0=0; x0<N; x0+=16ul) {
+            const long width = std::min((unsigned long)N-x0, 16ul);
+            long loops = loops_count;
+            long regs = regs_count;
+            long blocks = blocks_count;
+            const float *a_ptr0 = a_ptr0_base;
+            const float *b_ptr0 = B + (K_stride * x0);
+            const bool use_result_buffer = (width < 16);
+            float result_buffer[64];
+            const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(float);
+            float *c_ptr_real = c_ptr0;
+            if (use_result_buffer && accumulate) {
+                for(int cy=0; cy<std::min(M-y, 4); cy++) {
+                    for(unsigned int cx=0; cx<width; cx++) {
+                        result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
+                    }
+                }
+            }
+            if (use_result_buffer) {
+                c_ptr0 = result_buffer;
+            }
+            const float *biasptr = bias ? bias+x0 : nullbias;
+
+            switch(rows_to_compute) {
+                case 1:
+                    __asm __volatile (
+                        "cbnz %[accumulate], 1f\n"
+                        "ldr q16, [%[biasptr]]\n"
+                        "ldr q17, [%[biasptr], #0x10]\n"
+                        "ldr q18, [%[biasptr], #0x20]\n"
+                        "ldr q19, [%[biasptr], #0x30]\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #-0x10]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v16.4s, v8.4s, v0.s[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v17.4s, v9.4s, v0.s[1]\n"
+                        "ldr q8, [%[b_ptr0], #0x40]\n"
+                        "ldr q9, [%[b_ptr0], #0x50]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        "fmla v18.4s, v10.4s, v0.s[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla v16.4s, v8.4s, v0.s[3]\n"
+                        "ldr q8, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v9.4s, v0.s[3]\n"
+                        "ldr q9, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v10.4s, v0.s[3]\n"
+                        "ldr q10, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[3]\n"
+                        "ldr q11, [%[b_ptr0], #-0x10]\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ldr q0, [%[a_ptr0], #-0x10]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v8.4s, v4.s[1]\n"
+                        "ldr q8, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v9.4s, v4.s[1]\n"
+                        "ldr q9, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v10.4s, v4.s[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v11.4s, v4.s[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v8.4s, v4.s[3]\n"
+                        "ldr q8, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v9.4s, v4.s[3]\n"
+                        "ldr q9, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v10.4s, v4.s[3]\n"
+                        "ldr q10, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v11.4s, v4.s[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ldr q11, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "fmla v16.4s, v8.4s, v0.s[1]\n"
+                        "ldr q8, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v9.4s, v0.s[1]\n"
+                        "ldr q9, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v10.4s, v0.s[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v8.4s, v0.s[3]\n"
+                        "ldr q8, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v9.4s, v0.s[3]\n"
+                        "ldr q9, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v10.4s, v0.s[3]\n"
+                        "ldr q10, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[3]\n"
+                        "ldr q11, [%[b_ptr0], #-0x10]\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v8.4s, v4.s[1]\n"
+                        "ldr q8, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v9.4s, v4.s[1]\n"
+                        "ldr q9, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v10.4s, v4.s[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v11.4s, v4.s[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "add %[b_ptr0], %[b_ptr0], #-0x40\n"
+                        "fmla v16.4s, v8.4s, v4.s[3]\n"
+                        "fmla v17.4s, v9.4s, v4.s[3]\n"
+                        "fmla v18.4s, v10.4s, v4.s[3]\n"
+                        "fmla v19.4s, v11.4s, v4.s[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v8.4s, v0.s[1]\n"
+                        "ldr q8, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v9.4s, v0.s[1]\n"
+                        "ldr q9, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v10.4s, v0.s[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "add %[b_ptr0], %[b_ptr0], #-0x40\n"
+                        "fmla v16.4s, v8.4s, v0.s[3]\n"
+                        "fmla v17.4s, v9.4s, v0.s[3]\n"
+                        "fmla v18.4s, v10.4s, v0.s[3]\n"
+                        "fmla v19.4s, v11.4s, v0.s[3]\n"
+                        "5:\n"
+                        "cbz %[blocks], 6f\n"
+                        "7:\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr s0, [%[a_ptr0]]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "b.ne 7b\n"
+                        "6:\n"
+                        "ld1r {v14.4s}, [%[minptr]]\n"
+                        "ld1r {v15.4s}, [%[maxptr]]\n"
+                        "fmax v16.4s, v16.4s, v14.4s\n"
+                        "fmax v17.4s, v17.4s, v14.4s\n"
+                        "fmax v18.4s, v18.4s, v14.4s\n"
+                        "fmax v19.4s, v19.4s, v14.4s\n"
+                        "fmin v16.4s, v16.4s, v15.4s\n"
+                        "fmin v17.4s, v17.4s, v15.4s\n"
+                        "fmin v18.4s, v18.4s, v15.4s\n"
+                        "fmin v19.4s, v19.4s, v15.4s\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+                    );
+                    break;
+                case 2:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "c_ptr1 .req X1\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "cbnz %[accumulate], 1f\n"
+                        "ldr q16, [%[biasptr]]\n"
+                        "ldr q17, [%[biasptr], #0x10]\n"
+                        "ldr q18, [%[biasptr], #0x20]\n"
+                        "ldr q19, [%[biasptr], #0x30]\n"
+                        "mov v20.16b, v16.16b\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "mov v21.16b, v17.16b\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "mov v22.16b, v18.16b\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "mov v23.16b, v19.16b\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "ldr q20, [c_ptr1]\n"
+                        "ldr q21, [c_ptr1, #0x10]\n"
+                        "ldr q22, [c_ptr1, #0x20]\n"
+                        "ldr q23, [c_ptr1, #0x30]\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #-0x10]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v8.4s, v0.s[1]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        "fmla v20.4s, v8.4s, v1.s[1]\n"
+                        "ldr q8, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v9.4s, v0.s[1]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla v21.4s, v9.4s, v1.s[1]\n"
+                        "ldr q9, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v10.4s, v0.s[1]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla v22.4s, v10.4s, v1.s[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[1]\n"
+                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                        "fmla v23.4s, v11.4s, v1.s[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v8.4s, v0.s[3]\n"
+                        "fmla v20.4s, v8.4s, v1.s[3]\n"
+                        "ldr q8, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v9.4s, v0.s[3]\n"
+                        "fmla v21.4s, v9.4s, v1.s[3]\n"
+                        "ldr q9, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v10.4s, v0.s[3]\n"
+                        "fmla v22.4s, v10.4s, v1.s[3]\n"
+                        "ldr q10, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[3]\n"
+                        "ldr q0, [%[a_ptr0], #-0x10]\n"
+                        "fmla v23.4s, v11.4s, v1.s[3]\n"
+                        "ldr q11, [%[b_ptr0], #-0x10]\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ldr q1, [a_ptr1, #-0x10]\n"
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v8.4s, v4.s[1]\n"
+                        "fmla v20.4s, v8.4s, v5.s[1]\n"
+                        "ldr q8, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v9.4s, v4.s[1]\n"
+                        "fmla v21.4s, v9.4s, v5.s[1]\n"
+                        "ldr q9, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v10.4s, v4.s[1]\n"
+                        "fmla v22.4s, v10.4s, v5.s[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v11.4s, v4.s[1]\n"
+                        "fmla v23.4s, v11.4s, v5.s[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v8.4s, v4.s[3]\n"
+                        "fmla v20.4s, v8.4s, v5.s[3]\n"
+                        "ldr q8, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v9.4s, v4.s[3]\n"
+                        "fmla v21.4s, v9.4s, v5.s[3]\n"
+                        "ldr q9, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v10.4s, v4.s[3]\n"
+                        "fmla v22.4s, v10.4s, v5.s[3]\n"
+                        "ldr q10, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v11.4s, v4.s[3]\n"
+                        "fmla v23.4s, v11.4s, v5.s[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ldr q11, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "prfm PSTL1KEEP, [c_ptr1]\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v8.4s, v0.s[1]\n"
+                        "fmla v20.4s, v8.4s, v1.s[1]\n"
+                        "ldr q8, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v9.4s, v0.s[1]\n"
+                        "fmla v21.4s, v9.4s, v1.s[1]\n"
+                        "ldr q9, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v10.4s, v0.s[1]\n"
+                        "fmla v22.4s, v10.4s, v1.s[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[1]\n"
+                        "fmla v23.4s, v11.4s, v1.s[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v8.4s, v0.s[3]\n"
+                        "fmla v20.4s, v8.4s, v1.s[3]\n"
+                        "ldr q8, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v9.4s, v0.s[3]\n"
+                        "fmla v21.4s, v9.4s, v1.s[3]\n"
+                        "ldr q9, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v10.4s, v0.s[3]\n"
+                        "fmla v22.4s, v10.4s, v1.s[3]\n"
+                        "ldr q10, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[3]\n"
+                        "fmla v23.4s, v11.4s, v1.s[3]\n"
+                        "ldr q11, [%[b_ptr0], #-0x10]\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v8.4s, v4.s[1]\n"
+                        "fmla v20.4s, v8.4s, v5.s[1]\n"
+                        "ldr q8, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v9.4s, v4.s[1]\n"
+                        "fmla v21.4s, v9.4s, v5.s[1]\n"
+                        "ldr q9, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v10.4s, v4.s[1]\n"
+                        "fmla v22.4s, v10.4s, v5.s[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v11.4s, v4.s[1]\n"
+                        "fmla v23.4s, v11.4s, v5.s[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v8.4s, v4.s[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], #-0x40\n"
+                        "fmla v20.4s, v8.4s, v5.s[3]\n"
+                        "fmla v17.4s, v9.4s, v4.s[3]\n"
+                        "fmla v21.4s, v9.4s, v5.s[3]\n"
+                        "fmla v18.4s, v10.4s, v4.s[3]\n"
+                        "fmla v22.4s, v10.4s, v5.s[3]\n"
+                        "fmla v19.4s, v11.4s, v4.s[3]\n"
+                        "fmla v23.4s, v11.4s, v5.s[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v8.4s, v0.s[1]\n"
+                        "fmla v20.4s, v8.4s, v1.s[1]\n"
+                        "ldr q8, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v9.4s, v0.s[1]\n"
+                        "fmla v21.4s, v9.4s, v1.s[1]\n"
+                        "ldr q9, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v10.4s, v0.s[1]\n"
+                        "fmla v22.4s, v10.4s, v1.s[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[1]\n"
+                        "fmla v23.4s, v11.4s, v1.s[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v8.4s, v0.s[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], #-0x40\n"
+                        "fmla v20.4s, v8.4s, v1.s[3]\n"
+                        "fmla v17.4s, v9.4s, v0.s[3]\n"
+                        "fmla v21.4s, v9.4s, v1.s[3]\n"
+                        "fmla v18.4s, v10.4s, v0.s[3]\n"
+                        "fmla v22.4s, v10.4s, v1.s[3]\n"
+                        "fmla v19.4s, v11.4s, v0.s[3]\n"
+                        "fmla v23.4s, v11.4s, v1.s[3]\n"
+                        "5:\n"
+                        "cbz %[blocks], 6f\n"
+                        "7:\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr s0, [%[a_ptr0]]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr s1, [a_ptr1]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x4\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "b.ne 7b\n"
+                        "6:\n"
+                        "ld1r {v14.4s}, [%[minptr]]\n"
+                        "ld1r {v15.4s}, [%[maxptr]]\n"
+                        "fmax v16.4s, v16.4s, v14.4s\n"
+                        "fmax v17.4s, v17.4s, v14.4s\n"
+                        "fmax v18.4s, v18.4s, v14.4s\n"
+                        "fmax v19.4s, v19.4s, v14.4s\n"
+                        "fmin v16.4s, v16.4s, v15.4s\n"
+                        "fmin v17.4s, v17.4s, v15.4s\n"
+                        "fmin v18.4s, v18.4s, v15.4s\n"
+                        "fmin v19.4s, v19.4s, v15.4s\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "fmax v20.4s, v20.4s, v14.4s\n"
+                        "fmax v21.4s, v21.4s, v14.4s\n"
+                        "fmax v22.4s, v22.4s, v14.4s\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "fmax v23.4s, v23.4s, v14.4s\n"
+                        "fmin v20.4s, v20.4s, v15.4s\n"
+                        "fmin v21.4s, v21.4s, v15.4s\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "fmin v22.4s, v22.4s, v15.4s\n"
+                        "fmin v23.4s, v23.4s, v15.4s\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        "str q20, [c_ptr1]\n"
+                        "str q21, [c_ptr1, #0x10]\n"
+                        "str q22, [c_ptr1, #0x20]\n"
+                        "str q23, [c_ptr1, #0x30]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq c_ptr1\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
+                    );
+                    break;
+                case 3:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "c_ptr1 .req X2\n"
+                        "c_ptr2 .req X3\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "cbnz %[accumulate], 1f\n"
+                        "ldr q16, [%[biasptr]]\n"
+                        "ldr q17, [%[biasptr], #0x10]\n"
+                        "ldr q18, [%[biasptr], #0x20]\n"
+                        "ldr q19, [%[biasptr], #0x30]\n"
+                        "mov v20.16b, v16.16b\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "mov v21.16b, v17.16b\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "mov v22.16b, v18.16b\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "mov v23.16b, v19.16b\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "mov v24.16b, v16.16b\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "mov v25.16b, v17.16b\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "mov v26.16b, v18.16b\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "mov v27.16b, v19.16b\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "ldr q20, [c_ptr1]\n"
+                        "ldr q21, [c_ptr1, #0x10]\n"
+                        "ldr q22, [c_ptr1, #0x20]\n"
+                        "ldr q23, [c_ptr1, #0x30]\n"
+                        "ldr q24, [c_ptr2]\n"
+                        "ldr q25, [c_ptr2, #0x10]\n"
+                        "ldr q26, [c_ptr2, #0x20]\n"
+                        "ldr q27, [c_ptr2, #0x30]\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #-0x10]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr q6, [a_ptr2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v8.4s, v0.s[1]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "fmla v20.4s, v8.4s, v1.s[1]\n"
+                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                        "fmla v24.4s, v8.4s, v2.s[1]\n"
+                        "ldr q8, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v9.4s, v0.s[1]\n"
+                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                        "fmla v21.4s, v9.4s, v1.s[1]\n"
+                        "fmla v25.4s, v9.4s, v2.s[1]\n"
+                        "ldr q9, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v10.4s, v0.s[1]\n"
+                        "fmla v22.4s, v10.4s, v1.s[1]\n"
+                        "fmla v26.4s, v10.4s, v2.s[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[1]\n"
+                        "fmla v23.4s, v11.4s, v1.s[1]\n"
+                        "fmla v27.4s, v11.4s, v2.s[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v8.4s, v0.s[3]\n"
+                        "fmla v20.4s, v8.4s, v1.s[3]\n"
+                        "fmla v24.4s, v8.4s, v2.s[3]\n"
+                        "ldr q8, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v9.4s, v0.s[3]\n"
+                        "fmla v21.4s, v9.4s, v1.s[3]\n"
+                        "fmla v25.4s, v9.4s, v2.s[3]\n"
+                        "ldr q9, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v10.4s, v0.s[3]\n"
+                        "fmla v22.4s, v10.4s, v1.s[3]\n"
+                        "fmla v26.4s, v10.4s, v2.s[3]\n"
+                        "ldr q10, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[3]\n"
+                        "ldr q0, [%[a_ptr0], #-0x10]\n"
+                        "fmla v23.4s, v11.4s, v1.s[3]\n"
+                        "ldr q1, [a_ptr1, #-0x10]\n"
+                        "fmla v27.4s, v11.4s, v2.s[3]\n"
+                        "ldr q11, [%[b_ptr0], #-0x10]\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ldr q2, [a_ptr2, #-0x10]\n"
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "fmla v24.4s, v8.4s, v6.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "fmla v25.4s, v9.4s, v6.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "fmla v26.4s, v10.4s, v6.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "fmla v27.4s, v11.4s, v6.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v8.4s, v4.s[1]\n"
+                        "fmla v20.4s, v8.4s, v5.s[1]\n"
+                        "fmla v24.4s, v8.4s, v6.s[1]\n"
+                        "ldr q8, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v9.4s, v4.s[1]\n"
+                        "fmla v21.4s, v9.4s, v5.s[1]\n"
+                        "fmla v25.4s, v9.4s, v6.s[1]\n"
+                        "ldr q9, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v10.4s, v4.s[1]\n"
+                        "fmla v22.4s, v10.4s, v5.s[1]\n"
+                        "fmla v26.4s, v10.4s, v6.s[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v11.4s, v4.s[1]\n"
+                        "fmla v23.4s, v11.4s, v5.s[1]\n"
+                        "fmla v27.4s, v11.4s, v6.s[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "fmla v24.4s, v8.4s, v6.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "fmla v25.4s, v9.4s, v6.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "fmla v26.4s, v10.4s, v6.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "fmla v27.4s, v11.4s, v6.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v8.4s, v4.s[3]\n"
+                        "fmla v20.4s, v8.4s, v5.s[3]\n"
+                        "fmla v24.4s, v8.4s, v6.s[3]\n"
+                        "ldr q8, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v9.4s, v4.s[3]\n"
+                        "fmla v21.4s, v9.4s, v5.s[3]\n"
+                        "fmla v25.4s, v9.4s, v6.s[3]\n"
+                        "ldr q9, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v10.4s, v4.s[3]\n"
+                        "fmla v22.4s, v10.4s, v5.s[3]\n"
+                        "fmla v26.4s, v10.4s, v6.s[3]\n"
+                        "ldr q10, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v11.4s, v4.s[3]\n"
+                        "fmla v23.4s, v11.4s, v5.s[3]\n"
+                        "fmla v27.4s, v11.4s, v6.s[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ldr q11, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "prfm PSTL1KEEP, [c_ptr1]\n"
+                        "prfm PSTL1KEEP, [c_ptr2]\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "ldr q6, [a_ptr2]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v8.4s, v0.s[1]\n"
+                        "fmla v20.4s, v8.4s, v1.s[1]\n"
+                        "fmla v24.4s, v8.4s, v2.s[1]\n"
+                        "ldr q8, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v9.4s, v0.s[1]\n"
+                        "fmla v21.4s, v9.4s, v1.s[1]\n"
+                        "fmla v25.4s, v9.4s, v2.s[1]\n"
+                        "ldr q9, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v10.4s, v0.s[1]\n"
+                        "fmla v22.4s, v10.4s, v1.s[1]\n"
+                        "fmla v26.4s, v10.4s, v2.s[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[1]\n"
+                        "fmla v23.4s, v11.4s, v1.s[1]\n"
+                        "fmla v27.4s, v11.4s, v2.s[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v8.4s, v0.s[3]\n"
+                        "fmla v20.4s, v8.4s, v1.s[3]\n"
+                        "fmla v24.4s, v8.4s, v2.s[3]\n"
+                        "ldr q8, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v9.4s, v0.s[3]\n"
+                        "fmla v21.4s, v9.4s, v1.s[3]\n"
+                        "fmla v25.4s, v9.4s, v2.s[3]\n"
+                        "ldr q9, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v10.4s, v0.s[3]\n"
+                        "fmla v22.4s, v10.4s, v1.s[3]\n"
+                        "fmla v26.4s, v10.4s, v2.s[3]\n"
+                        "ldr q10, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[3]\n"
+                        "fmla v23.4s, v11.4s, v1.s[3]\n"
+                        "fmla v27.4s, v11.4s, v2.s[3]\n"
+                        "ldr q11, [%[b_ptr0], #-0x10]\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "fmla v24.4s, v8.4s, v6.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "fmla v25.4s, v9.4s, v6.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "fmla v26.4s, v10.4s, v6.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "fmla v27.4s, v11.4s, v6.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v8.4s, v4.s[1]\n"
+                        "fmla v20.4s, v8.4s, v5.s[1]\n"
+                        "fmla v24.4s, v8.4s, v6.s[1]\n"
+                        "ldr q8, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v9.4s, v4.s[1]\n"
+                        "fmla v21.4s, v9.4s, v5.s[1]\n"
+                        "fmla v25.4s, v9.4s, v6.s[1]\n"
+                        "ldr q9, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v10.4s, v4.s[1]\n"
+                        "fmla v22.4s, v10.4s, v5.s[1]\n"
+                        "fmla v26.4s, v10.4s, v6.s[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v11.4s, v4.s[1]\n"
+                        "fmla v23.4s, v11.4s, v5.s[1]\n"
+                        "fmla v27.4s, v11.4s, v6.s[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "fmla v24.4s, v8.4s, v6.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "fmla v25.4s, v9.4s, v6.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "fmla v26.4s, v10.4s, v6.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "fmla v27.4s, v11.4s, v6.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v8.4s, v4.s[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], #-0x40\n"
+                        "fmla v20.4s, v8.4s, v5.s[3]\n"
+                        "fmla v24.4s, v8.4s, v6.s[3]\n"
+                        "fmla v17.4s, v9.4s, v4.s[3]\n"
+                        "fmla v21.4s, v9.4s, v5.s[3]\n"
+                        "fmla v25.4s, v9.4s, v6.s[3]\n"
+                        "fmla v18.4s, v10.4s, v4.s[3]\n"
+                        "fmla v22.4s, v10.4s, v5.s[3]\n"
+                        "fmla v26.4s, v10.4s, v6.s[3]\n"
+                        "fmla v19.4s, v11.4s, v4.s[3]\n"
+                        "fmla v23.4s, v11.4s, v5.s[3]\n"
+                        "fmla v27.4s, v11.4s, v6.s[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v8.4s, v0.s[1]\n"
+                        "fmla v20.4s, v8.4s, v1.s[1]\n"
+                        "fmla v24.4s, v8.4s, v2.s[1]\n"
+                        "ldr q8, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v9.4s, v0.s[1]\n"
+                        "fmla v21.4s, v9.4s, v1.s[1]\n"
+                        "fmla v25.4s, v9.4s, v2.s[1]\n"
+                        "ldr q9, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v10.4s, v0.s[1]\n"
+                        "fmla v22.4s, v10.4s, v1.s[1]\n"
+                        "fmla v26.4s, v10.4s, v2.s[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[1]\n"
+                        "fmla v23.4s, v11.4s, v1.s[1]\n"
+                        "fmla v27.4s, v11.4s, v2.s[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v8.4s, v0.s[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], #-0x40\n"
+                        "fmla v20.4s, v8.4s, v1.s[3]\n"
+                        "fmla v24.4s, v8.4s, v2.s[3]\n"
+                        "fmla v17.4s, v9.4s, v0.s[3]\n"
+                        "fmla v21.4s, v9.4s, v1.s[3]\n"
+                        "fmla v25.4s, v9.4s, v2.s[3]\n"
+                        "fmla v18.4s, v10.4s, v0.s[3]\n"
+                        "fmla v22.4s, v10.4s, v1.s[3]\n"
+                        "fmla v26.4s, v10.4s, v2.s[3]\n"
+                        "fmla v19.4s, v11.4s, v0.s[3]\n"
+                        "fmla v23.4s, v11.4s, v1.s[3]\n"
+                        "fmla v27.4s, v11.4s, v2.s[3]\n"
+                        "5:\n"
+                        "cbz %[blocks], 6f\n"
+                        "7:\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr s0, [%[a_ptr0]]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr s1, [a_ptr1]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x4\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr s2, [a_ptr2]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "add a_ptr2, a_ptr2, #0x4\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "b.ne 7b\n"
+                        "6:\n"
+                        "ld1r {v14.4s}, [%[minptr]]\n"
+                        "ld1r {v15.4s}, [%[maxptr]]\n"
+                        "fmax v16.4s, v16.4s, v14.4s\n"
+                        "fmax v17.4s, v17.4s, v14.4s\n"
+                        "fmax v18.4s, v18.4s, v14.4s\n"
+                        "fmax v19.4s, v19.4s, v14.4s\n"
+                        "fmin v16.4s, v16.4s, v15.4s\n"
+                        "fmin v17.4s, v17.4s, v15.4s\n"
+                        "fmin v18.4s, v18.4s, v15.4s\n"
+                        "fmin v19.4s, v19.4s, v15.4s\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "fmax v20.4s, v20.4s, v14.4s\n"
+                        "fmax v21.4s, v21.4s, v14.4s\n"
+                        "fmax v22.4s, v22.4s, v14.4s\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "fmax v23.4s, v23.4s, v14.4s\n"
+                        "fmin v20.4s, v20.4s, v15.4s\n"
+                        "fmin v21.4s, v21.4s, v15.4s\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "fmin v22.4s, v22.4s, v15.4s\n"
+                        "fmin v23.4s, v23.4s, v15.4s\n"
+                        "fmax v24.4s, v24.4s, v14.4s\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "fmax v25.4s, v25.4s, v14.4s\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        "fmax v26.4s, v26.4s, v14.4s\n"
+                        "str q20, [c_ptr1]\n"
+                        "fmin v24.4s, v24.4s, v15.4s\n"
+                        "fmin v25.4s, v25.4s, v15.4s\n"
+                        "fmax v27.4s, v27.4s, v14.4s\n"
+                        "str q21, [c_ptr1, #0x10]\n"
+                        "fmin v26.4s, v26.4s, v15.4s\n"
+                        "fmin v27.4s, v27.4s, v15.4s\n"
+                        "str q22, [c_ptr1, #0x20]\n"
+                        "str q23, [c_ptr1, #0x30]\n"
+                        "str q24, [c_ptr2]\n"
+                        "str q25, [c_ptr2, #0x10]\n"
+                        "str q26, [c_ptr2, #0x20]\n"
+                        "str q27, [c_ptr2, #0x30]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
+                    );
+                    break;
+                default:
+                case 4:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "a_ptr3 .req X2\n"
+                        "c_ptr1 .req X3\n"
+                        "c_ptr2 .req X4\n"
+                        "c_ptr3 .req X5\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
+                        "cbnz %[accumulate], 1f\n"
+                        "ldr q16, [%[biasptr]]\n"
+                        "ldr q17, [%[biasptr], #0x10]\n"
+                        "ldr q18, [%[biasptr], #0x20]\n"
+                        "ldr q19, [%[biasptr], #0x30]\n"
+                        "mov v20.16b, v16.16b\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "mov v21.16b, v17.16b\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "mov v22.16b, v18.16b\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "mov v23.16b, v19.16b\n"
+                        "ldr q3, [a_ptr3]\n"
+                        "mov v24.16b, v16.16b\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "mov v25.16b, v17.16b\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "mov v26.16b, v18.16b\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "mov v27.16b, v19.16b\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "mov v28.16b, v16.16b\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "mov v29.16b, v17.16b\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "mov v30.16b, v18.16b\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "mov v31.16b, v19.16b\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ldr q16, [%[c_ptr0]]\n"
+                        "ldr q17, [%[c_ptr0], #0x10]\n"
+                        "ldr q18, [%[c_ptr0], #0x20]\n"
+                        "ldr q19, [%[c_ptr0], #0x30]\n"
+                        "ldr q20, [c_ptr1]\n"
+                        "ldr q21, [c_ptr1, #0x10]\n"
+                        "ldr q22, [c_ptr1, #0x20]\n"
+                        "ldr q23, [c_ptr1, #0x30]\n"
+                        "ldr q24, [c_ptr2]\n"
+                        "ldr q25, [c_ptr2, #0x10]\n"
+                        "ldr q26, [c_ptr2, #0x20]\n"
+                        "ldr q27, [c_ptr2, #0x30]\n"
+                        "ldr q28, [c_ptr3]\n"
+                        "ldr q29, [c_ptr3, #0x10]\n"
+                        "ldr q30, [c_ptr3, #0x20]\n"
+                        "ldr q31, [c_ptr3, #0x30]\n"
+                        "ldr q0, [%[a_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ldr q1, [a_ptr1]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "ldr q2, [a_ptr2]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ldr q3, [a_ptr3]\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #-0x10]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        "fmla v28.4s, v8.4s, v3.s[0]\n"
+                        "ldr q6, [a_ptr2]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr q7, [a_ptr3]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "fmla v29.4s, v9.4s, v3.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "fmla v30.4s, v10.4s, v3.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
+                        "fmla v31.4s, v11.4s, v3.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v8.4s, v0.s[1]\n"
+                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
+                        "fmla v20.4s, v8.4s, v1.s[1]\n"
+                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
+                        "fmla v24.4s, v8.4s, v2.s[1]\n"
+                        "fmla v28.4s, v8.4s, v3.s[1]\n"
+                        "ldr q8, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v9.4s, v0.s[1]\n"
+                        "fmla v21.4s, v9.4s, v1.s[1]\n"
+                        "fmla v25.4s, v9.4s, v2.s[1]\n"
+                        "fmla v29.4s, v9.4s, v3.s[1]\n"
+                        "ldr q9, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v10.4s, v0.s[1]\n"
+                        "fmla v22.4s, v10.4s, v1.s[1]\n"
+                        "fmla v26.4s, v10.4s, v2.s[1]\n"
+                        "fmla v30.4s, v10.4s, v3.s[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[1]\n"
+                        "fmla v23.4s, v11.4s, v1.s[1]\n"
+                        "fmla v27.4s, v11.4s, v2.s[1]\n"
+                        "fmla v31.4s, v11.4s, v3.s[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "fmla v28.4s, v8.4s, v3.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "fmla v29.4s, v9.4s, v3.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "fmla v30.4s, v10.4s, v3.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "fmla v31.4s, v11.4s, v3.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v8.4s, v0.s[3]\n"
+                        "fmla v20.4s, v8.4s, v1.s[3]\n"
+                        "fmla v24.4s, v8.4s, v2.s[3]\n"
+                        "fmla v28.4s, v8.4s, v3.s[3]\n"
+                        "ldr q8, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v9.4s, v0.s[3]\n"
+                        "fmla v21.4s, v9.4s, v1.s[3]\n"
+                        "fmla v25.4s, v9.4s, v2.s[3]\n"
+                        "fmla v29.4s, v9.4s, v3.s[3]\n"
+                        "ldr q9, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v10.4s, v0.s[3]\n"
+                        "fmla v22.4s, v10.4s, v1.s[3]\n"
+                        "fmla v26.4s, v10.4s, v2.s[3]\n"
+                        "fmla v30.4s, v10.4s, v3.s[3]\n"
+                        "ldr q10, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[3]\n"
+                        "ldr q0, [%[a_ptr0], #-0x10]\n"
+                        "fmla v23.4s, v11.4s, v1.s[3]\n"
+                        "ldr q1, [a_ptr1, #-0x10]\n"
+                        "fmla v27.4s, v11.4s, v2.s[3]\n"
+                        "ldr q2, [a_ptr2, #-0x10]\n"
+                        "fmla v31.4s, v11.4s, v3.s[3]\n"
+                        "ldr q11, [%[b_ptr0], #-0x10]\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "ldr q3, [a_ptr3, #-0x10]\n"
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "fmla v24.4s, v8.4s, v6.s[0]\n"
+                        "fmla v28.4s, v8.4s, v7.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "fmla v25.4s, v9.4s, v6.s[0]\n"
+                        "fmla v29.4s, v9.4s, v7.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "fmla v26.4s, v10.4s, v6.s[0]\n"
+                        "fmla v30.4s, v10.4s, v7.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "fmla v27.4s, v11.4s, v6.s[0]\n"
+                        "fmla v31.4s, v11.4s, v7.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v8.4s, v4.s[1]\n"
+                        "fmla v20.4s, v8.4s, v5.s[1]\n"
+                        "fmla v24.4s, v8.4s, v6.s[1]\n"
+                        "fmla v28.4s, v8.4s, v7.s[1]\n"
+                        "ldr q8, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v9.4s, v4.s[1]\n"
+                        "fmla v21.4s, v9.4s, v5.s[1]\n"
+                        "fmla v25.4s, v9.4s, v6.s[1]\n"
+                        "fmla v29.4s, v9.4s, v7.s[1]\n"
+                        "ldr q9, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v10.4s, v4.s[1]\n"
+                        "fmla v22.4s, v10.4s, v5.s[1]\n"
+                        "fmla v26.4s, v10.4s, v6.s[1]\n"
+                        "fmla v30.4s, v10.4s, v7.s[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v11.4s, v4.s[1]\n"
+                        "fmla v23.4s, v11.4s, v5.s[1]\n"
+                        "fmla v27.4s, v11.4s, v6.s[1]\n"
+                        "fmla v31.4s, v11.4s, v7.s[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "fmla v24.4s, v8.4s, v6.s[2]\n"
+                        "fmla v28.4s, v8.4s, v7.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "fmla v25.4s, v9.4s, v6.s[2]\n"
+                        "fmla v29.4s, v9.4s, v7.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "fmla v26.4s, v10.4s, v6.s[2]\n"
+                        "fmla v30.4s, v10.4s, v7.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "fmla v27.4s, v11.4s, v6.s[2]\n"
+                        "fmla v31.4s, v11.4s, v7.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v8.4s, v4.s[3]\n"
+                        "fmla v20.4s, v8.4s, v5.s[3]\n"
+                        "fmla v24.4s, v8.4s, v6.s[3]\n"
+                        "fmla v28.4s, v8.4s, v7.s[3]\n"
+                        "ldr q8, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v9.4s, v4.s[3]\n"
+                        "fmla v21.4s, v9.4s, v5.s[3]\n"
+                        "fmla v25.4s, v9.4s, v6.s[3]\n"
+                        "fmla v29.4s, v9.4s, v7.s[3]\n"
+                        "ldr q9, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v10.4s, v4.s[3]\n"
+                        "fmla v22.4s, v10.4s, v5.s[3]\n"
+                        "fmla v26.4s, v10.4s, v6.s[3]\n"
+                        "fmla v30.4s, v10.4s, v7.s[3]\n"
+                        "ldr q10, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v11.4s, v4.s[3]\n"
+                        "fmla v23.4s, v11.4s, v5.s[3]\n"
+                        "fmla v27.4s, v11.4s, v6.s[3]\n"
+                        "fmla v31.4s, v11.4s, v7.s[3]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "ldr q11, [%[b_ptr0], #-0x10]\n"
+                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
+                        "prfm PSTL1KEEP, [c_ptr1]\n"
+                        "prfm PSTL1KEEP, [c_ptr2]\n"
+                        "prfm PSTL1KEEP, [c_ptr3]\n"
+                        "cbz %[regs], 4f\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr q4, [%[a_ptr0]]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "ldr q5, [a_ptr1]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "ldr q6, [a_ptr2]\n"
+                        "fmla v28.4s, v8.4s, v3.s[0]\n"
+                        "ldr q7, [a_ptr3]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "fmla v29.4s, v9.4s, v3.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "fmla v30.4s, v10.4s, v3.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "fmla v31.4s, v11.4s, v3.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v8.4s, v0.s[1]\n"
+                        "fmla v20.4s, v8.4s, v1.s[1]\n"
+                        "fmla v24.4s, v8.4s, v2.s[1]\n"
+                        "fmla v28.4s, v8.4s, v3.s[1]\n"
+                        "ldr q8, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v9.4s, v0.s[1]\n"
+                        "fmla v21.4s, v9.4s, v1.s[1]\n"
+                        "fmla v25.4s, v9.4s, v2.s[1]\n"
+                        "fmla v29.4s, v9.4s, v3.s[1]\n"
+                        "ldr q9, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v10.4s, v0.s[1]\n"
+                        "fmla v22.4s, v10.4s, v1.s[1]\n"
+                        "fmla v26.4s, v10.4s, v2.s[1]\n"
+                        "fmla v30.4s, v10.4s, v3.s[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[1]\n"
+                        "fmla v23.4s, v11.4s, v1.s[1]\n"
+                        "fmla v27.4s, v11.4s, v2.s[1]\n"
+                        "fmla v31.4s, v11.4s, v3.s[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "fmla v28.4s, v8.4s, v3.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "fmla v29.4s, v9.4s, v3.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "fmla v30.4s, v10.4s, v3.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "fmla v31.4s, v11.4s, v3.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v8.4s, v0.s[3]\n"
+                        "fmla v20.4s, v8.4s, v1.s[3]\n"
+                        "fmla v24.4s, v8.4s, v2.s[3]\n"
+                        "fmla v28.4s, v8.4s, v3.s[3]\n"
+                        "ldr q8, [%[b_ptr0], #-0x40]\n"
+                        "fmla v17.4s, v9.4s, v0.s[3]\n"
+                        "fmla v21.4s, v9.4s, v1.s[3]\n"
+                        "fmla v25.4s, v9.4s, v2.s[3]\n"
+                        "fmla v29.4s, v9.4s, v3.s[3]\n"
+                        "ldr q9, [%[b_ptr0], #-0x30]\n"
+                        "fmla v18.4s, v10.4s, v0.s[3]\n"
+                        "fmla v22.4s, v10.4s, v1.s[3]\n"
+                        "fmla v26.4s, v10.4s, v2.s[3]\n"
+                        "fmla v30.4s, v10.4s, v3.s[3]\n"
+                        "ldr q10, [%[b_ptr0], #-0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[3]\n"
+                        "fmla v23.4s, v11.4s, v1.s[3]\n"
+                        "fmla v27.4s, v11.4s, v2.s[3]\n"
+                        "fmla v31.4s, v11.4s, v3.s[3]\n"
+                        "ldr q11, [%[b_ptr0], #-0x10]\n"
+                        "fmla v16.4s, v8.4s, v4.s[0]\n"
+                        "fmla v20.4s, v8.4s, v5.s[0]\n"
+                        "fmla v24.4s, v8.4s, v6.s[0]\n"
+                        "fmla v28.4s, v8.4s, v7.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v4.s[0]\n"
+                        "fmla v21.4s, v9.4s, v5.s[0]\n"
+                        "fmla v25.4s, v9.4s, v6.s[0]\n"
+                        "fmla v29.4s, v9.4s, v7.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v4.s[0]\n"
+                        "fmla v22.4s, v10.4s, v5.s[0]\n"
+                        "fmla v26.4s, v10.4s, v6.s[0]\n"
+                        "fmla v30.4s, v10.4s, v7.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v4.s[0]\n"
+                        "fmla v23.4s, v11.4s, v5.s[0]\n"
+                        "fmla v27.4s, v11.4s, v6.s[0]\n"
+                        "fmla v31.4s, v11.4s, v7.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v8.4s, v4.s[1]\n"
+                        "fmla v20.4s, v8.4s, v5.s[1]\n"
+                        "fmla v24.4s, v8.4s, v6.s[1]\n"
+                        "fmla v28.4s, v8.4s, v7.s[1]\n"
+                        "ldr q8, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v9.4s, v4.s[1]\n"
+                        "fmla v21.4s, v9.4s, v5.s[1]\n"
+                        "fmla v25.4s, v9.4s, v6.s[1]\n"
+                        "fmla v29.4s, v9.4s, v7.s[1]\n"
+                        "ldr q9, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v10.4s, v4.s[1]\n"
+                        "fmla v22.4s, v10.4s, v5.s[1]\n"
+                        "fmla v26.4s, v10.4s, v6.s[1]\n"
+                        "fmla v30.4s, v10.4s, v7.s[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v11.4s, v4.s[1]\n"
+                        "fmla v23.4s, v11.4s, v5.s[1]\n"
+                        "fmla v27.4s, v11.4s, v6.s[1]\n"
+                        "fmla v31.4s, v11.4s, v7.s[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v4.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v5.s[2]\n"
+                        "fmla v24.4s, v8.4s, v6.s[2]\n"
+                        "fmla v28.4s, v8.4s, v7.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v4.s[2]\n"
+                        "fmla v21.4s, v9.4s, v5.s[2]\n"
+                        "fmla v25.4s, v9.4s, v6.s[2]\n"
+                        "fmla v29.4s, v9.4s, v7.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v4.s[2]\n"
+                        "fmla v22.4s, v10.4s, v5.s[2]\n"
+                        "fmla v26.4s, v10.4s, v6.s[2]\n"
+                        "fmla v30.4s, v10.4s, v7.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v4.s[2]\n"
+                        "fmla v23.4s, v11.4s, v5.s[2]\n"
+                        "fmla v27.4s, v11.4s, v6.s[2]\n"
+                        "fmla v31.4s, v11.4s, v7.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v8.4s, v4.s[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], #-0x40\n"
+                        "fmla v20.4s, v8.4s, v5.s[3]\n"
+                        "fmla v24.4s, v8.4s, v6.s[3]\n"
+                        "fmla v28.4s, v8.4s, v7.s[3]\n"
+                        "fmla v17.4s, v9.4s, v4.s[3]\n"
+                        "fmla v21.4s, v9.4s, v5.s[3]\n"
+                        "fmla v25.4s, v9.4s, v6.s[3]\n"
+                        "fmla v29.4s, v9.4s, v7.s[3]\n"
+                        "fmla v18.4s, v10.4s, v4.s[3]\n"
+                        "fmla v22.4s, v10.4s, v5.s[3]\n"
+                        "fmla v26.4s, v10.4s, v6.s[3]\n"
+                        "fmla v30.4s, v10.4s, v7.s[3]\n"
+                        "fmla v19.4s, v11.4s, v4.s[3]\n"
+                        "fmla v23.4s, v11.4s, v5.s[3]\n"
+                        "fmla v27.4s, v11.4s, v6.s[3]\n"
+                        "fmla v31.4s, v11.4s, v7.s[3]\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "fmla v28.4s, v8.4s, v3.s[0]\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "fmla v29.4s, v9.4s, v3.s[0]\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "fmla v30.4s, v10.4s, v3.s[0]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "fmla v31.4s, v11.4s, v3.s[0]\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "fmla v16.4s, v8.4s, v0.s[1]\n"
+                        "fmla v20.4s, v8.4s, v1.s[1]\n"
+                        "fmla v24.4s, v8.4s, v2.s[1]\n"
+                        "fmla v28.4s, v8.4s, v3.s[1]\n"
+                        "ldr q8, [%[b_ptr0], #0x40]\n"
+                        "fmla v17.4s, v9.4s, v0.s[1]\n"
+                        "fmla v21.4s, v9.4s, v1.s[1]\n"
+                        "fmla v25.4s, v9.4s, v2.s[1]\n"
+                        "fmla v29.4s, v9.4s, v3.s[1]\n"
+                        "ldr q9, [%[b_ptr0], #0x50]\n"
+                        "fmla v18.4s, v10.4s, v0.s[1]\n"
+                        "fmla v22.4s, v10.4s, v1.s[1]\n"
+                        "fmla v26.4s, v10.4s, v2.s[1]\n"
+                        "fmla v30.4s, v10.4s, v3.s[1]\n"
+                        "ldr q10, [%[b_ptr0], #0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[1]\n"
+                        "fmla v23.4s, v11.4s, v1.s[1]\n"
+                        "fmla v27.4s, v11.4s, v2.s[1]\n"
+                        "fmla v31.4s, v11.4s, v3.s[1]\n"
+                        "ldr q11, [%[b_ptr0], #0x70]\n"
+                        "fmla v16.4s, v8.4s, v0.s[2]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x100\n"
+                        "fmla v20.4s, v8.4s, v1.s[2]\n"
+                        "fmla v24.4s, v8.4s, v2.s[2]\n"
+                        "fmla v28.4s, v8.4s, v3.s[2]\n"
+                        "ldr q8, [%[b_ptr0], #-0x80]\n"
+                        "fmla v17.4s, v9.4s, v0.s[2]\n"
+                        "fmla v21.4s, v9.4s, v1.s[2]\n"
+                        "fmla v25.4s, v9.4s, v2.s[2]\n"
+                        "fmla v29.4s, v9.4s, v3.s[2]\n"
+                        "ldr q9, [%[b_ptr0], #-0x70]\n"
+                        "fmla v18.4s, v10.4s, v0.s[2]\n"
+                        "fmla v22.4s, v10.4s, v1.s[2]\n"
+                        "fmla v26.4s, v10.4s, v2.s[2]\n"
+                        "fmla v30.4s, v10.4s, v3.s[2]\n"
+                        "ldr q10, [%[b_ptr0], #-0x60]\n"
+                        "fmla v19.4s, v11.4s, v0.s[2]\n"
+                        "fmla v23.4s, v11.4s, v1.s[2]\n"
+                        "fmla v27.4s, v11.4s, v2.s[2]\n"
+                        "fmla v31.4s, v11.4s, v3.s[2]\n"
+                        "ldr q11, [%[b_ptr0], #-0x50]\n"
+                        "fmla v16.4s, v8.4s, v0.s[3]\n"
+                        "add %[b_ptr0], %[b_ptr0], #-0x40\n"
+                        "fmla v20.4s, v8.4s, v1.s[3]\n"
+                        "fmla v24.4s, v8.4s, v2.s[3]\n"
+                        "fmla v28.4s, v8.4s, v3.s[3]\n"
+                        "fmla v17.4s, v9.4s, v0.s[3]\n"
+                        "fmla v21.4s, v9.4s, v1.s[3]\n"
+                        "fmla v25.4s, v9.4s, v2.s[3]\n"
+                        "fmla v29.4s, v9.4s, v3.s[3]\n"
+                        "fmla v18.4s, v10.4s, v0.s[3]\n"
+                        "fmla v22.4s, v10.4s, v1.s[3]\n"
+                        "fmla v26.4s, v10.4s, v2.s[3]\n"
+                        "fmla v30.4s, v10.4s, v3.s[3]\n"
+                        "fmla v19.4s, v11.4s, v0.s[3]\n"
+                        "fmla v23.4s, v11.4s, v1.s[3]\n"
+                        "fmla v27.4s, v11.4s, v2.s[3]\n"
+                        "fmla v31.4s, v11.4s, v3.s[3]\n"
+                        "5:\n"
+                        "cbz %[blocks], 6f\n"
+                        "7:\n"
+                        "ldr q8, [%[b_ptr0]]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ldr q9, [%[b_ptr0], #0x10]\n"
+                        "ldr s0, [%[a_ptr0]]\n"
+                        "ldr q10, [%[b_ptr0], #0x20]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
+                        "ldr q11, [%[b_ptr0], #0x30]\n"
+                        "add %[b_ptr0], %[b_ptr0], #0x40\n"
+                        "fmla v16.4s, v8.4s, v0.s[0]\n"
+                        "ldr s1, [a_ptr1]\n"
+                        "fmla v17.4s, v9.4s, v0.s[0]\n"
+                        "add a_ptr1, a_ptr1, #0x4\n"
+                        "fmla v18.4s, v10.4s, v0.s[0]\n"
+                        "ldr s2, [a_ptr2]\n"
+                        "fmla v20.4s, v8.4s, v1.s[0]\n"
+                        "add a_ptr2, a_ptr2, #0x4\n"
+                        "fmla v21.4s, v9.4s, v1.s[0]\n"
+                        "ldr s3, [a_ptr3]\n"
+                        "fmla v24.4s, v8.4s, v2.s[0]\n"
+                        "add a_ptr3, a_ptr3, #0x4\n"
+                        "fmla v25.4s, v9.4s, v2.s[0]\n"
+                        "fmla v28.4s, v8.4s, v3.s[0]\n"
+                        "fmla v29.4s, v9.4s, v3.s[0]\n"
+                        "fmla v22.4s, v10.4s, v1.s[0]\n"
+                        "fmla v26.4s, v10.4s, v2.s[0]\n"
+                        "fmla v30.4s, v10.4s, v3.s[0]\n"
+                        "fmla v19.4s, v11.4s, v0.s[0]\n"
+                        "fmla v23.4s, v11.4s, v1.s[0]\n"
+                        "fmla v27.4s, v11.4s, v2.s[0]\n"
+                        "fmla v31.4s, v11.4s, v3.s[0]\n"
+                        "b.ne 7b\n"
+                        "6:\n"
+                        "ld1r {v14.4s}, [%[minptr]]\n"
+                        "ld1r {v15.4s}, [%[maxptr]]\n"
+                        "fmax v16.4s, v16.4s, v14.4s\n"
+                        "fmax v17.4s, v17.4s, v14.4s\n"
+                        "fmax v18.4s, v18.4s, v14.4s\n"
+                        "fmax v19.4s, v19.4s, v14.4s\n"
+                        "fmin v16.4s, v16.4s, v15.4s\n"
+                        "fmin v17.4s, v17.4s, v15.4s\n"
+                        "fmin v18.4s, v18.4s, v15.4s\n"
+                        "fmin v19.4s, v19.4s, v15.4s\n"
+                        "str q16, [%[c_ptr0]]\n"
+                        "fmax v20.4s, v20.4s, v14.4s\n"
+                        "fmax v21.4s, v21.4s, v14.4s\n"
+                        "fmax v22.4s, v22.4s, v14.4s\n"
+                        "str q17, [%[c_ptr0], #0x10]\n"
+                        "fmax v23.4s, v23.4s, v14.4s\n"
+                        "fmin v20.4s, v20.4s, v15.4s\n"
+                        "fmin v21.4s, v21.4s, v15.4s\n"
+                        "str q18, [%[c_ptr0], #0x20]\n"
+                        "fmin v22.4s, v22.4s, v15.4s\n"
+                        "fmin v23.4s, v23.4s, v15.4s\n"
+                        "fmax v24.4s, v24.4s, v14.4s\n"
+                        "str q19, [%[c_ptr0], #0x30]\n"
+                        "fmax v25.4s, v25.4s, v14.4s\n"
+                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
+                        "fmax v26.4s, v26.4s, v14.4s\n"
+                        "str q20, [c_ptr1]\n"
+                        "fmin v24.4s, v24.4s, v15.4s\n"
+                        "fmin v25.4s, v25.4s, v15.4s\n"
+                        "fmax v27.4s, v27.4s, v14.4s\n"
+                        "str q21, [c_ptr1, #0x10]\n"
+                        "fmin v26.4s, v26.4s, v15.4s\n"
+                        "fmax v28.4s, v28.4s, v14.4s\n"
+                        "fmax v29.4s, v29.4s, v14.4s\n"
+                        "str q22, [c_ptr1, #0x20]\n"
+                        "fmin v27.4s, v27.4s, v15.4s\n"
+                        "fmax v30.4s, v30.4s, v14.4s\n"
+                        "fmin v28.4s, v28.4s, v15.4s\n"
+                        "str q23, [c_ptr1, #0x30]\n"
+                        "fmin v29.4s, v29.4s, v15.4s\n"
+                        "fmax v31.4s, v31.4s, v14.4s\n"
+                        "fmin v30.4s, v30.4s, v15.4s\n"
+                        "str q24, [c_ptr2]\n"
+                        "fmin v31.4s, v31.4s, v15.4s\n"
+                        "str q25, [c_ptr2, #0x10]\n"
+                        "str q26, [c_ptr2, #0x20]\n"
+                        "str q27, [c_ptr2, #0x30]\n"
+                        "str q28, [c_ptr3]\n"
+                        "str q29, [c_ptr3, #0x10]\n"
+                        "str q30, [c_ptr3, #0x20]\n"
+                        "str q31, [c_ptr3, #0x30]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq a_ptr3\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq c_ptr3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
+                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+                    );
+                    break;
+            }
+            if (use_result_buffer) {
+                for(int cy=0; cy<std::min(M-y, 4); cy++) {
+                    for(unsigned int cx=0; cx<width; cx++) {
+                        c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
+                    }
+                }
+            }
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8.hpp
index da5beef..b60401b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,7 +58,7 @@
         return 1;
     }
 
-    static constexpr bool supports_append()
+    static constexpr bool supports_accumulate()
     {
         return false;
     }
@@ -78,9 +78,9 @@
     // Default to the generic kernel
     kern_type kernel=a64_hybrid_fp32_mla_4x8;
 
-    hybrid_fp32_mla_4x8(const CPUInfo *ci)
+    hybrid_fp32_mla_4x8(const CPUInfo *)
     {
-        UNUSED(ci);
+
     }
 };
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp
index db7eb83..7442d25 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 
 namespace arm_gemm {
 
-void a64_hybrid_fp32_mla_4x8(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool append) {
+void a64_hybrid_fp32_mla_4x8(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
     const int K_stride = K;
     const long loops_count = ((K + 4) / 8) - 1;
     K -= loops_count * 8;
@@ -40,7 +40,7 @@
     K -= (regs_count + 1) * 4;
     const long blocks_count = K / 1;
     float nullbias[4];
-    if (!append && !bias) {
+    if (!accumulate && !bias) {
         memset(nullbias, 0, (4 * sizeof(float)));
     }
     float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
@@ -61,12 +61,23 @@
             break;
     }
 
-    for (int y=0; y<M; y+=8) {
+    int rows_to_compute;
+
+    for (int y=0; y<M; y+=rows_to_compute) {
         const float * const a_ptr0_base = A + (y * lda);
         const unsigned long ldab = lda * sizeof(float);
 
         float *c_ptr0 = C + (y * ldc);
 
+        rows_to_compute = M-y;
+        if (rows_to_compute > 8) {
+            if (rows_to_compute % 8) {
+                rows_to_compute = 8 - 1;
+            } else {
+                rows_to_compute = 8;
+            }
+        }
+
         for (int x0=0; x0<N; x0+=4ul) {
             const long width = std::min((unsigned long)N-x0, 4ul);
             long loops = loops_count;
@@ -78,7 +89,7 @@
             float result_buffer[32];
             const unsigned long ldcb = (use_result_buffer ? 4 : ldc) * sizeof(float);
             float *c_ptr_real = c_ptr0;
-            if (use_result_buffer && append) {
+            if (use_result_buffer && accumulate) {
                 for(int cy=0; cy<std::min(M-y, 8); cy++) {
                     for(unsigned int cx=0; cx<width; cx++) {
                         result_buffer[cy * 4 + cx] = c_ptr_real[cy * ldc + cx];
@@ -90,7 +101,7 @@
             }
             const float *biasptr = bias ? bias+x0 : nullbias;
 
-            switch(M-y) {
+            switch(rows_to_compute) {
                 case 1:
                     __asm __volatile (
                         "ldr q24, [%[biasptr]]\n"
@@ -168,7 +179,7 @@
                         "str q24, [%[c_ptr0]]\n"
                         "add %[c_ptr0], %[c_ptr0], #0x10\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
                     );
                     break;
@@ -291,7 +302,7 @@
                         ".unreq a_ptr1\n"
                         ".unreq c_ptr1\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
                     );
                     break;
@@ -456,7 +467,7 @@
                         ".unreq c_ptr1\n"
                         ".unreq c_ptr2\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
                     );
                     break;
@@ -663,7 +674,7 @@
                         ".unreq c_ptr2\n"
                         ".unreq c_ptr3\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                     );
                     break;
@@ -911,7 +922,7 @@
                         ".unreq c_ptr3\n"
                         ".unreq c_ptr4\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
                     );
                     break;
@@ -1200,7 +1211,7 @@
                         ".unreq c_ptr4\n"
                         ".unreq c_ptr5\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
                     );
                     break;
@@ -1530,7 +1541,7 @@
                         ".unreq c_ptr5\n"
                         ".unreq c_ptr6\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "cc", "memory"
                     );
                     break;
@@ -1902,7 +1913,7 @@
                         ".unreq c_ptr6\n"
                         ".unreq c_ptr7\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc", "memory"
                     );
                     break;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp
index bdc62ea..a23101a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -59,7 +59,7 @@
         return 4;
     }
 
-    static constexpr bool supports_append()
+    static constexpr bool supports_accumulate()
     {
         return true;
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
index 7c08aa2..4a7cdc5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,9 +32,7 @@
 
 namespace arm_gemm {
 
-void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *bias, Activation act, bool append) {
-    UNUSED(bias);
-    UNUSED(act);
+void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation , bool accumulate) {
     const int K_stride = ((K + 3) / 4) * 4;
     const long loops_count = ((K + 16) / 32) - 1;
     K -= loops_count * 32;
@@ -43,12 +41,23 @@
     const long blocks_count = K / 4;
     const long odds_count = K - (blocks_count * 4);
 
-    for (int y=0; y<M; y+=4) {
+    int rows_to_compute;
+
+    for (int y=0; y<M; y+=rows_to_compute) {
         const int8_t * const a_ptr0_base = A + (y * lda);
         const unsigned long ldab = lda * sizeof(int8_t);
 
         int32_t *c_ptr0 = C + (y * ldc);
 
+        rows_to_compute = M-y;
+        if (rows_to_compute > 4) {
+            if (rows_to_compute % 4) {
+                rows_to_compute = 4 - 1;
+            } else {
+                rows_to_compute = 4;
+            }
+        }
+
         for (int x0=0; x0<N; x0+=16ul) {
             const long width = std::min((unsigned long)N-x0, 16ul);
             long loops = loops_count;
@@ -61,7 +70,7 @@
             int32_t result_buffer[64];
             const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(int32_t);
             int32_t *c_ptr_real = c_ptr0;
-            if (use_result_buffer && append) {
+            if (use_result_buffer && accumulate) {
                 for(int cy=0; cy<std::min(M-y, 4); cy++) {
                     for(unsigned int cx=0; cx<width; cx++) {
                         result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
@@ -72,14 +81,14 @@
                 c_ptr0 = result_buffer;
             }
 
-            switch(M-y) {
+            switch(rows_to_compute) {
                 case 1:
                     __asm __volatile (
                         "temploadreg0 .req X0\n"
                         "temploadreg1 .req X1\n"
                         "temploadreg2 .req X2\n"
                         "temploadreg3 .req X3\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "movi v16.4s, #0\n"
                         "ldr q0, [%[a_ptr0]]\n"
                         "movi v17.4s, #0\n"
@@ -460,7 +469,7 @@
                         ".unreq temploadreg2\n"
                         ".unreq temploadreg3\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
                     );
                     break;
@@ -474,7 +483,7 @@
                         "temploadreg3 .req X5\n"
                         "add a_ptr1, %[a_ptr0], %[lda]\n"
                         "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "movi v16.4s, #0\n"
                         "ldr q0, [%[a_ptr0]]\n"
                         "movi v17.4s, #0\n"
@@ -979,7 +988,7 @@
                         ".unreq temploadreg2\n"
                         ".unreq temploadreg3\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                     );
                     break;
@@ -997,7 +1006,7 @@
                         "add c_ptr1, %[c_ptr0], %[ldc]\n"
                         "add a_ptr2, a_ptr1, %[lda]\n"
                         "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "movi v16.4s, #0\n"
                         "ldr q0, [%[a_ptr0]]\n"
                         "movi v17.4s, #0\n"
@@ -1627,7 +1636,7 @@
                         ".unreq temploadreg2\n"
                         ".unreq temploadreg3\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
                     );
                     break;
@@ -1650,7 +1659,7 @@
                         "add c_ptr2, c_ptr1, %[ldc]\n"
                         "add a_ptr3, a_ptr2, %[lda]\n"
                         "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "movi v16.4s, #0\n"
                         "ldr q0, [%[a_ptr0]]\n"
                         "movi v17.4s, #0\n"
@@ -2404,7 +2413,7 @@
                         ".unreq temploadreg2\n"
                         ".unreq temploadreg3\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
                     );
                     break;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
index 9f06a48..da39a32 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,9 +32,7 @@
 
 namespace arm_gemm {
 
-void a64_hybrid_s8s32_dot_16x4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *bias, Activation act, bool append) {
-    UNUSED(bias);
-    UNUSED(act);
+void a64_hybrid_s8s32_dot_16x4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation , bool accumulate) {
     const int K_stride = ((K + 3) / 4) * 4;
     const long loops_count = ((K + 16) / 32) - 1;
     K -= loops_count * 32;
@@ -43,12 +41,23 @@
     const long blocks_count = K / 4;
     const long odds_count = K - (blocks_count * 4);
 
-    for (int y=0; y<M; y+=4) {
+    int rows_to_compute;
+
+    for (int y=0; y<M; y+=rows_to_compute) {
         const int8_t * const a_ptr0_base = A + (y * lda);
         const unsigned long ldab = lda * sizeof(int8_t);
 
         int32_t *c_ptr0 = C + (y * ldc);
 
+        rows_to_compute = M-y;
+        if (rows_to_compute > 4) {
+            if (rows_to_compute % 4) {
+                rows_to_compute = 4 - 1;
+            } else {
+                rows_to_compute = 4;
+            }
+        }
+
         for (int x0=0; x0<N; x0+=16ul) {
             const long width = std::min((unsigned long)N-x0, 16ul);
             long loops = loops_count;
@@ -61,7 +70,7 @@
             int32_t result_buffer[64];
             const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(int32_t);
             int32_t *c_ptr_real = c_ptr0;
-            if (use_result_buffer && append) {
+            if (use_result_buffer && accumulate) {
                 for(int cy=0; cy<std::min(M-y, 4); cy++) {
                     for(unsigned int cx=0; cx<width; cx++) {
                         result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
@@ -72,10 +81,10 @@
                 c_ptr0 = result_buffer;
             }
 
-            switch(M-y) {
+            switch(rows_to_compute) {
                 case 1:
                     __asm __volatile (
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "movi v16.4s, #0\n"
                         "ldr q0, [%[a_ptr0]]\n"
                         "movi v17.4s, #0\n"
@@ -313,7 +322,7 @@
                         "str q19, [%[c_ptr0], #0x30]\n"
                         "add %[c_ptr0], %[c_ptr0], #0x40\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
                     );
                     break;
@@ -323,7 +332,7 @@
                         "c_ptr1 .req X1\n"
                         "add a_ptr1, %[a_ptr0], %[lda]\n"
                         "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "movi v16.4s, #0\n"
                         "ldr q0, [%[a_ptr0]]\n"
                         "movi v17.4s, #0\n"
@@ -679,7 +688,7 @@
                         ".unreq a_ptr1\n"
                         ".unreq c_ptr1\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
                     );
                     break;
@@ -693,7 +702,7 @@
                         "add c_ptr1, %[c_ptr0], %[ldc]\n"
                         "add a_ptr2, a_ptr1, %[lda]\n"
                         "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "movi v16.4s, #0\n"
                         "ldr q0, [%[a_ptr0]]\n"
                         "movi v17.4s, #0\n"
@@ -1167,7 +1176,7 @@
                         ".unreq c_ptr1\n"
                         ".unreq c_ptr2\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
                     );
                     break;
@@ -1186,7 +1195,7 @@
                         "add c_ptr2, c_ptr1, %[ldc]\n"
                         "add a_ptr3, a_ptr2, %[lda]\n"
                         "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "movi v16.4s, #0\n"
                         "ldr q0, [%[a_ptr0]]\n"
                         "movi v17.4s, #0\n"
@@ -1778,7 +1787,7 @@
                         ".unreq c_ptr2\n"
                         ".unreq c_ptr3\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                     );
                     break;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp
index 5295650..e5a88b4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -59,7 +59,7 @@
         return 4;
     }
 
-    static constexpr bool supports_append()
+    static constexpr bool supports_accumulate()
     {
         return true;
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
index e8ed0c3..735e5fd 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,10 +32,7 @@
 
 namespace arm_gemm {
 
-void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *bias, Activation act, bool append) {
-    UNUSED(bias);
-    UNUSED(act);
-
+void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool accumulate) {
     const int K_stride = ((K + 3) / 4) * 4;
     const long loops_count = ((K + 16) / 32) - 1;
     K -= loops_count * 32;
@@ -44,12 +41,23 @@
     const long blocks_count = K / 4;
     const long odds_count = K - (blocks_count * 4);
 
-    for (int y=0; y<M; y+=4) {
+    int rows_to_compute;
+
+    for (int y=0; y<M; y+=rows_to_compute) {
         const uint8_t * const a_ptr0_base = A + (y * lda);
         const unsigned long ldab = lda * sizeof(uint8_t);
 
         uint32_t *c_ptr0 = C + (y * ldc);
 
+        rows_to_compute = M-y;
+        if (rows_to_compute > 4) {
+            if (rows_to_compute % 4) {
+                rows_to_compute = 4 - 1;
+            } else {
+                rows_to_compute = 4;
+            }
+        }
+
         for (int x0=0; x0<N; x0+=16ul) {
             const long width = std::min((unsigned long)N-x0, 16ul);
             long loops = loops_count;
@@ -62,7 +70,7 @@
             uint32_t result_buffer[64];
             const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(uint32_t);
             uint32_t *c_ptr_real = c_ptr0;
-            if (use_result_buffer && append) {
+            if (use_result_buffer && accumulate) {
                 for(int cy=0; cy<std::min(M-y, 4); cy++) {
                     for(unsigned int cx=0; cx<width; cx++) {
                         result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
@@ -73,14 +81,14 @@
                 c_ptr0 = result_buffer;
             }
 
-            switch(M-y) {
+            switch(rows_to_compute) {
                 case 1:
                     __asm __volatile (
                         "temploadreg0 .req X0\n"
                         "temploadreg1 .req X1\n"
                         "temploadreg2 .req X2\n"
                         "temploadreg3 .req X3\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "movi v16.4s, #0\n"
                         "ldr q0, [%[a_ptr0]]\n"
                         "movi v17.4s, #0\n"
@@ -461,7 +469,7 @@
                         ".unreq temploadreg2\n"
                         ".unreq temploadreg3\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
                     );
                     break;
@@ -475,7 +483,7 @@
                         "temploadreg3 .req X5\n"
                         "add a_ptr1, %[a_ptr0], %[lda]\n"
                         "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "movi v16.4s, #0\n"
                         "ldr q0, [%[a_ptr0]]\n"
                         "movi v17.4s, #0\n"
@@ -980,7 +988,7 @@
                         ".unreq temploadreg2\n"
                         ".unreq temploadreg3\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                     );
                     break;
@@ -998,7 +1006,7 @@
                         "add c_ptr1, %[c_ptr0], %[ldc]\n"
                         "add a_ptr2, a_ptr1, %[lda]\n"
                         "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "movi v16.4s, #0\n"
                         "ldr q0, [%[a_ptr0]]\n"
                         "movi v17.4s, #0\n"
@@ -1628,7 +1636,7 @@
                         ".unreq temploadreg2\n"
                         ".unreq temploadreg3\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
                     );
                     break;
@@ -1651,7 +1659,7 @@
                         "add c_ptr2, c_ptr1, %[ldc]\n"
                         "add a_ptr3, a_ptr2, %[lda]\n"
                         "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "movi v16.4s, #0\n"
                         "ldr q0, [%[a_ptr0]]\n"
                         "movi v17.4s, #0\n"
@@ -2405,7 +2413,7 @@
                         ".unreq temploadreg2\n"
                         ".unreq temploadreg3\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
                     );
                     break;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
index 23d919a..2e86233 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,10 +32,7 @@
 
 namespace arm_gemm {
 
-void a64_hybrid_u8u32_dot_16x4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *bias, Activation act, bool append) {
-    UNUSED(bias);
-    UNUSED(act);
-
+void a64_hybrid_u8u32_dot_16x4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool accumulate) {
     const int K_stride = ((K + 3) / 4) * 4;
     const long loops_count = ((K + 16) / 32) - 1;
     K -= loops_count * 32;
@@ -44,12 +41,23 @@
     const long blocks_count = K / 4;
     const long odds_count = K - (blocks_count * 4);
 
-    for (int y=0; y<M; y+=4) {
+    int rows_to_compute;
+
+    for (int y=0; y<M; y+=rows_to_compute) {
         const uint8_t * const a_ptr0_base = A + (y * lda);
         const unsigned long ldab = lda * sizeof(uint8_t);
 
         uint32_t *c_ptr0 = C + (y * ldc);
 
+        rows_to_compute = M-y;
+        if (rows_to_compute > 4) {
+            if (rows_to_compute % 4) {
+                rows_to_compute = 4 - 1;
+            } else {
+                rows_to_compute = 4;
+            }
+        }
+
         for (int x0=0; x0<N; x0+=16ul) {
             const long width = std::min((unsigned long)N-x0, 16ul);
             long loops = loops_count;
@@ -62,7 +70,7 @@
             uint32_t result_buffer[64];
             const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(uint32_t);
             uint32_t *c_ptr_real = c_ptr0;
-            if (use_result_buffer && append) {
+            if (use_result_buffer && accumulate) {
                 for(int cy=0; cy<std::min(M-y, 4); cy++) {
                     for(unsigned int cx=0; cx<width; cx++) {
                         result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
@@ -73,10 +81,10 @@
                 c_ptr0 = result_buffer;
             }
 
-            switch(M-y) {
+            switch(rows_to_compute) {
                 case 1:
                     __asm __volatile (
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "movi v16.4s, #0\n"
                         "ldr q0, [%[a_ptr0]]\n"
                         "movi v17.4s, #0\n"
@@ -314,7 +322,7 @@
                         "str q19, [%[c_ptr0], #0x30]\n"
                         "add %[c_ptr0], %[c_ptr0], #0x40\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
                     );
                     break;
@@ -324,7 +332,7 @@
                         "c_ptr1 .req X1\n"
                         "add a_ptr1, %[a_ptr0], %[lda]\n"
                         "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "movi v16.4s, #0\n"
                         "ldr q0, [%[a_ptr0]]\n"
                         "movi v17.4s, #0\n"
@@ -680,7 +688,7 @@
                         ".unreq a_ptr1\n"
                         ".unreq c_ptr1\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
                     );
                     break;
@@ -694,7 +702,7 @@
                         "add c_ptr1, %[c_ptr0], %[ldc]\n"
                         "add a_ptr2, a_ptr1, %[lda]\n"
                         "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "movi v16.4s, #0\n"
                         "ldr q0, [%[a_ptr0]]\n"
                         "movi v17.4s, #0\n"
@@ -1168,7 +1176,7 @@
                         ".unreq c_ptr1\n"
                         ".unreq c_ptr2\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
                     );
                     break;
@@ -1187,7 +1195,7 @@
                         "add c_ptr2, c_ptr1, %[ldc]\n"
                         "add a_ptr3, a_ptr2, %[lda]\n"
                         "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "movi v16.4s, #0\n"
                         "ldr q0, [%[a_ptr0]]\n"
                         "movi v17.4s, #0\n"
@@ -1779,7 +1787,7 @@
                         ".unreq c_ptr2\n"
                         ".unreq c_ptr3\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
                         : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                     );
                     break;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8.hpp
index 0f6c345..95fed86 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,6 +32,7 @@
 
 // Actual kernel implementations
 void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+void a64_interleaved_bf16fp32_dot_12x8_x1(const bfloat16 *, const bfloat16 *, float *, int, int, int);
 
 class interleaved_bf16fp32_dot_12x8 {
 public:
@@ -61,7 +62,12 @@
 
     kern_type kernel=a64_interleaved_bf16fp32_dot_12x8;
 
-    interleaved_bf16fp32_dot_12x8(const CPUInfo *ci) { UNUSED(ci); }
+    interleaved_bf16fp32_dot_12x8(const CPUInfo *ci)
+    {
+        if (ci->get_cpu_model() == CPUModel::X1) {
+            kernel = a64_interleaved_bf16fp32_dot_12x8_x1;
+        }
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp
index 8ce6a60..7ffae52 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,13 +57,11 @@
                 "movi v12.4s, #0\n"
                 "ldr q2, [%[a_ptr], #0x20]\n"
                 "movi v13.4s, #0\n"
-                "ldr q6, [%[b_ptr], #0x20]\n"
-                "movi v14.4s, #0\n"
-                "ldr q3, [%[a_ptr], #0x30]\n"
-                "movi v15.4s, #0\n"
                 "add %[a_ptr], %[a_ptr], #0x40\n"
-                "movi v16.4s, #0\n"
+                "movi v14.4s, #0\n"
                 "add %[b_ptr], %[b_ptr], #0x30\n"
+                "movi v15.4s, #0\n"
+                "movi v16.4s, #0\n"
                 "movi v17.4s, #0\n"
                 "movi v18.4s, #0\n"
                 "movi v19.4s, #0\n"
@@ -82,9 +80,11 @@
                 "cbz %[loops], 1f\n"
                 "2:\n"
                 ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
-                "subs %[loops], %[loops], #0x1\n"
+                "ldr q6, [%[b_ptr], #-0x10]\n"
                 ".inst 0x4f60f089 // bfdot v9.4s, v4.8h, v0.h[1]\n"
+                "ldr q3, [%[a_ptr], #-0x10]\n"
                 ".inst 0x4f40f88a // bfdot v10.4s, v4.8h, v0.h[2]\n"
+                "subs %[loops], %[loops], #0x1\n"
                 ".inst 0x4f60f88b // bfdot v11.4s, v4.8h, v0.h[3]\n"
                 ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n"
                 ".inst 0x4f61f095 // bfdot v21.4s, v4.8h, v1.h[1]\n"
@@ -140,13 +140,13 @@
                 ".inst 0x4f63f0dd // bfdot v29.4s, v6.8h, v3.h[1]\n"
                 ".inst 0x4f43f8de // bfdot v30.4s, v6.8h, v3.h[2]\n"
                 ".inst 0x4f63f8df // bfdot v31.4s, v6.8h, v3.h[3]\n"
-                "ldr q6, [%[b_ptr], #-0x10]\n"
-                "ldr q3, [%[a_ptr], #-0x10]\n"
                 "b.ne 2b\n"
                 "1:\n"
                 "cbz %[tails], 3f\n"
                 ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
+                "ldr q6, [%[b_ptr], #-0x10]\n"
                 ".inst 0x4f60f089 // bfdot v9.4s, v4.8h, v0.h[1]\n"
+                "ldr q3, [%[a_ptr], #-0x10]\n"
                 ".inst 0x4f40f88a // bfdot v10.4s, v4.8h, v0.h[2]\n"
                 ".inst 0x4f60f88b // bfdot v11.4s, v4.8h, v0.h[3]\n"
                 ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n"
@@ -178,12 +178,13 @@
                 ".inst 0x4f62f089 // bfdot v9.4s, v4.8h, v2.h[1]\n"
                 "add %[a_ptr], %[a_ptr], #0x20\n"
                 ".inst 0x4f42f88a // bfdot v10.4s, v4.8h, v2.h[2]\n"
+                "add %[b_ptr], %[b_ptr], #0x60\n"
                 ".inst 0x4f62f88b // bfdot v11.4s, v4.8h, v2.h[3]\n"
                 ".inst 0x4f43f094 // bfdot v20.4s, v4.8h, v3.h[0]\n"
                 ".inst 0x4f63f095 // bfdot v21.4s, v4.8h, v3.h[1]\n"
                 ".inst 0x4f43f896 // bfdot v22.4s, v4.8h, v3.h[2]\n"
                 ".inst 0x4f63f897 // bfdot v23.4s, v4.8h, v3.h[3]\n"
-                "ldr q4, [%[b_ptr], #0x30]\n"
+                "ldr q4, [%[b_ptr], #-0x30]\n"
                 ".inst 0x4f42f0ac // bfdot v12.4s, v5.8h, v2.h[0]\n"
                 ".inst 0x4f62f0ad // bfdot v13.4s, v5.8h, v2.h[1]\n"
                 ".inst 0x4f42f8ae // bfdot v14.4s, v5.8h, v2.h[2]\n"
@@ -192,7 +193,7 @@
                 ".inst 0x4f63f0b9 // bfdot v25.4s, v5.8h, v3.h[1]\n"
                 ".inst 0x4f43f8ba // bfdot v26.4s, v5.8h, v3.h[2]\n"
                 ".inst 0x4f63f8bb // bfdot v27.4s, v5.8h, v3.h[3]\n"
-                "ldr q5, [%[b_ptr], #0x40]\n"
+                "ldr q5, [%[b_ptr], #-0x20]\n"
                 ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
                 ".inst 0x4f62f0d1 // bfdot v17.4s, v6.8h, v2.h[1]\n"
                 ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
@@ -201,13 +202,12 @@
                 ".inst 0x4f63f0dd // bfdot v29.4s, v6.8h, v3.h[1]\n"
                 ".inst 0x4f43f8de // bfdot v30.4s, v6.8h, v3.h[2]\n"
                 ".inst 0x4f63f8df // bfdot v31.4s, v6.8h, v3.h[3]\n"
-                "ldr q6, [%[b_ptr], #0x50]\n"
+                "ldr q6, [%[b_ptr], #-0x10]\n"
                 ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
-                "add %[b_ptr], %[b_ptr], #0x60\n"
                 ".inst 0x4f60f089 // bfdot v9.4s, v4.8h, v0.h[1]\n"
                 ".inst 0x4f40f88a // bfdot v10.4s, v4.8h, v0.h[2]\n"
-                "str q8, [%[c_ptr]]\n"
                 ".inst 0x4f60f88b // bfdot v11.4s, v4.8h, v0.h[3]\n"
+                "str q8, [%[c_ptr]]\n"
                 ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n"
                 ".inst 0x4f61f095 // bfdot v21.4s, v4.8h, v1.h[1]\n"
                 ".inst 0x4f41f896 // bfdot v22.4s, v4.8h, v1.h[2]\n"
@@ -234,14 +234,17 @@
                 "b 4f\n"
                 "3:\n"
                 ".inst 0x4f40f088 // bfdot v8.4s, v4.8h, v0.h[0]\n"
+                "ldr q6, [%[b_ptr], #-0x10]\n"
                 ".inst 0x4f60f089 // bfdot v9.4s, v4.8h, v0.h[1]\n"
+                "ldr q3, [%[a_ptr], #-0x10]\n"
                 ".inst 0x4f40f88a // bfdot v10.4s, v4.8h, v0.h[2]\n"
+                "add %[b_ptr], %[b_ptr], #0x30\n"
                 ".inst 0x4f60f88b // bfdot v11.4s, v4.8h, v0.h[3]\n"
                 ".inst 0x4f41f094 // bfdot v20.4s, v4.8h, v1.h[0]\n"
                 ".inst 0x4f61f095 // bfdot v21.4s, v4.8h, v1.h[1]\n"
                 ".inst 0x4f41f896 // bfdot v22.4s, v4.8h, v1.h[2]\n"
                 ".inst 0x4f61f897 // bfdot v23.4s, v4.8h, v1.h[3]\n"
-                "ldr q4, [%[b_ptr]]\n"
+                "ldr q4, [%[b_ptr], #-0x30]\n"
                 ".inst 0x4f40f0ac // bfdot v12.4s, v5.8h, v0.h[0]\n"
                 ".inst 0x4f60f0ad // bfdot v13.4s, v5.8h, v0.h[1]\n"
                 ".inst 0x4f40f8ae // bfdot v14.4s, v5.8h, v0.h[2]\n"
@@ -250,7 +253,7 @@
                 ".inst 0x4f61f0b9 // bfdot v25.4s, v5.8h, v1.h[1]\n"
                 ".inst 0x4f41f8ba // bfdot v26.4s, v5.8h, v1.h[2]\n"
                 ".inst 0x4f61f8bb // bfdot v27.4s, v5.8h, v1.h[3]\n"
-                "ldr q5, [%[b_ptr], #0x10]\n"
+                "ldr q5, [%[b_ptr], #-0x20]\n"
                 ".inst 0x4f40f0d0 // bfdot v16.4s, v6.8h, v0.h[0]\n"
                 ".inst 0x4f60f0d1 // bfdot v17.4s, v6.8h, v0.h[1]\n"
                 ".inst 0x4f40f8d2 // bfdot v18.4s, v6.8h, v0.h[2]\n"
@@ -259,13 +262,12 @@
                 ".inst 0x4f61f0dd // bfdot v29.4s, v6.8h, v1.h[1]\n"
                 ".inst 0x4f41f8de // bfdot v30.4s, v6.8h, v1.h[2]\n"
                 ".inst 0x4f61f8df // bfdot v31.4s, v6.8h, v1.h[3]\n"
-                "ldr q6, [%[b_ptr], #0x20]\n"
+                "ldr q6, [%[b_ptr], #-0x10]\n"
                 ".inst 0x4f42f088 // bfdot v8.4s, v4.8h, v2.h[0]\n"
-                "add %[b_ptr], %[b_ptr], #0x30\n"
                 ".inst 0x4f62f089 // bfdot v9.4s, v4.8h, v2.h[1]\n"
                 ".inst 0x4f42f88a // bfdot v10.4s, v4.8h, v2.h[2]\n"
-                "str q8, [%[c_ptr]]\n"
                 ".inst 0x4f62f88b // bfdot v11.4s, v4.8h, v2.h[3]\n"
+                "str q8, [%[c_ptr]]\n"
                 ".inst 0x4f43f094 // bfdot v20.4s, v4.8h, v3.h[0]\n"
                 ".inst 0x4f63f095 // bfdot v21.4s, v4.8h, v3.h[1]\n"
                 ".inst 0x4f43f896 // bfdot v22.4s, v4.8h, v3.h[2]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp
new file mode 100644
index 0000000..58a5143
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp
@@ -0,0 +1,328 @@
+/*
+ * Copyright (c) 2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "../../bfloat.hpp"
+#include "../../asmlib.hpp"
+
+namespace arm_gemm {
+
+void a64_interleaved_bf16fp32_dot_12x8_x1(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+    const bfloat16 *a_ptr = Apanel;
+    float *c_ptr = Cpanel;
+
+    K /= 2;
+    const long loops_count = (K / 2) - 1;
+    const long tails_count = K % 2;
+
+    for (int yb=0; yb<ablocks; yb++) {
+        const bfloat16 *a_ptr0 = a_ptr;
+        const bfloat16 *b_ptr = Bpanel;
+
+        for (int xb=0; xb<bblocks; xb++) {
+            a_ptr = a_ptr0;
+            long loops = loops_count;
+            long tails = tails_count;
+
+            __asm __volatile (
+                "movi v8.4s, #0\n"
+                "ldr q0, [%[a_ptr]]\n"
+                "movi v9.4s, #0\n"
+                "ldr q2, [%[b_ptr]]\n"
+                "movi v10.4s, #0\n"
+                "ldr q1, [%[a_ptr], #0x10]\n"
+                "movi v11.4s, #0\n"
+                "ldr q3, [%[b_ptr], #0x10]\n"
+                "movi v12.4s, #0\n"
+                "ldr q4, [%[b_ptr], #0x20]\n"
+                "movi v13.4s, #0\n"
+                "add %[a_ptr], %[a_ptr], #0x20\n"
+                "movi v14.4s, #0\n"
+                "add %[b_ptr], %[b_ptr], #0x30\n"
+                "movi v15.4s, #0\n"
+                "movi v16.4s, #0\n"
+                "movi v17.4s, #0\n"
+                "movi v18.4s, #0\n"
+                "movi v19.4s, #0\n"
+                "movi v20.4s, #0\n"
+                "movi v21.4s, #0\n"
+                "movi v22.4s, #0\n"
+                "movi v23.4s, #0\n"
+                "movi v24.4s, #0\n"
+                "movi v25.4s, #0\n"
+                "movi v26.4s, #0\n"
+                "movi v27.4s, #0\n"
+                "movi v28.4s, #0\n"
+                "movi v29.4s, #0\n"
+                "movi v30.4s, #0\n"
+                "movi v31.4s, #0\n"
+                "cbz %[loops], 1f\n"
+                "2:\n"
+                ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
+                "subs %[loops], %[loops], #0x1\n"
+                ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
+                ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
+                ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
+                ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
+                ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
+                ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
+                ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
+                "ldr q2, [%[b_ptr]]\n"
+                ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
+                ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
+                ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
+                ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
+                ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
+                ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
+                ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
+                ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
+                "ldr q3, [%[b_ptr], #0x10]\n"
+                ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
+                ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
+                ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
+                ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
+                "ldr q0, [%[a_ptr]]\n"
+                ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
+                ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
+                ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
+                ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
+                "ldr q4, [%[b_ptr], #0x20]\n"
+                ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
+                "ldr q1, [%[a_ptr], #0x10]\n"
+                ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
+                "add %[a_ptr], %[a_ptr], #0x40\n"
+                ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
+                "add %[b_ptr], %[b_ptr], #0x60\n"
+                ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
+                ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
+                ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
+                ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
+                ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
+                "ldr q2, [%[b_ptr], #-0x30]\n"
+                ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
+                ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
+                ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
+                ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
+                ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
+                ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
+                ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
+                ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
+                "ldr q3, [%[b_ptr], #-0x20]\n"
+                ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
+                ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
+                ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
+                ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
+                "ldr q0, [%[a_ptr], #-0x20]\n"
+                ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
+                ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
+                ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
+                ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
+                "ldr q4, [%[b_ptr], #-0x10]\n"
+                "ldr q1, [%[a_ptr], #-0x10]\n"
+                "b.ne 2b\n"
+                "1:\n"
+                "cbz %[tails], 3f\n"
+                ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
+                ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
+                ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
+                ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
+                ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
+                ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
+                ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
+                ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
+                "ldr q2, [%[b_ptr]]\n"
+                ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
+                ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
+                ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
+                ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
+                ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
+                ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
+                ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
+                ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
+                "ldr q3, [%[b_ptr], #0x10]\n"
+                ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
+                ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
+                ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
+                ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
+                "ldr q0, [%[a_ptr]]\n"
+                ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
+                ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
+                ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
+                ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
+                "ldr q4, [%[b_ptr], #0x20]\n"
+                ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
+                "ldr q1, [%[a_ptr], #0x10]\n"
+                ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
+                "add %[a_ptr], %[a_ptr], #0x40\n"
+                ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
+                "add %[b_ptr], %[b_ptr], #0x60\n"
+                ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
+                ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
+                ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
+                ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
+                ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
+                "ldr q2, [%[b_ptr], #-0x30]\n"
+                ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
+                ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
+                ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
+                ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
+                ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
+                ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
+                ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
+                ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
+                "ldr q3, [%[b_ptr], #-0x20]\n"
+                ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
+                ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
+                ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
+                ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
+                "ldr q0, [%[a_ptr], #-0x20]\n"
+                ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
+                ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
+                ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
+                ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
+                "ldr q4, [%[b_ptr], #-0x10]\n"
+                ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
+                "ldr q1, [%[a_ptr], #-0x10]\n"
+                ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
+                ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
+                ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
+                "str q8, [%[c_ptr]]\n"
+                ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
+                ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
+                ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
+                ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
+                ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
+                ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
+                ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
+                ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
+                "str q12, [%[c_ptr], #0x10]\n"
+                ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
+                ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
+                ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
+                ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
+                ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
+                ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
+                ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
+                ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
+                "str q16, [%[c_ptr], #0x20]\n"
+                ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
+                ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
+                ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
+                "str q9, [%[c_ptr], #0x30]\n"
+                ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
+                "b 4f\n"
+                "3:\n"
+                ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
+                "add %[a_ptr], %[a_ptr], #0x20\n"
+                ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
+                "add %[b_ptr], %[b_ptr], #0x30\n"
+                ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
+                ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
+                ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
+                ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
+                ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
+                ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
+                "ldr q2, [%[b_ptr], #-0x30]\n"
+                ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
+                ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
+                ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
+                ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
+                ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
+                ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
+                ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
+                ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
+                "ldr q3, [%[b_ptr], #-0x20]\n"
+                ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
+                ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
+                ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
+                ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
+                "ldr q0, [%[a_ptr], #-0x20]\n"
+                ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
+                ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
+                ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
+                ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
+                "ldr q4, [%[b_ptr], #-0x10]\n"
+                ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
+                "ldr q1, [%[a_ptr], #-0x10]\n"
+                ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
+                ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
+                ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
+                "str q8, [%[c_ptr]]\n"
+                ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
+                ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
+                ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
+                ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
+                ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
+                ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
+                ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
+                ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
+                "str q12, [%[c_ptr], #0x10]\n"
+                ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
+                ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
+                ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
+                ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
+                ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
+                ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
+                ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
+                ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
+                "str q16, [%[c_ptr], #0x20]\n"
+                ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
+                ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
+                ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
+                "str q9, [%[c_ptr], #0x30]\n"
+                ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
+                "4:\n"
+                "str q13, [%[c_ptr], #0x40]\n"
+                "str q17, [%[c_ptr], #0x50]\n"
+                "str q10, [%[c_ptr], #0x60]\n"
+                "str q14, [%[c_ptr], #0x70]\n"
+                "str q18, [%[c_ptr], #0x80]\n"
+                "str q11, [%[c_ptr], #0x90]\n"
+                "str q15, [%[c_ptr], #0xa0]\n"
+                "str q19, [%[c_ptr], #0xb0]\n"
+                "str q20, [%[c_ptr], #0xc0]\n"
+                "str q24, [%[c_ptr], #0xd0]\n"
+                "str q28, [%[c_ptr], #0xe0]\n"
+                "str q21, [%[c_ptr], #0xf0]\n"
+                "str q25, [%[c_ptr], #0x100]\n"
+                "str q29, [%[c_ptr], #0x110]\n"
+                "str q22, [%[c_ptr], #0x120]\n"
+                "str q26, [%[c_ptr], #0x130]\n"
+                "str q30, [%[c_ptr], #0x140]\n"
+                "str q23, [%[c_ptr], #0x150]\n"
+                "str q27, [%[c_ptr], #0x160]\n"
+                "str q31, [%[c_ptr], #0x170]\n"
+                "add %[c_ptr], %[c_ptr], #0x180\n"
+            : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+              [loops] "+r" (loops), [tails] "+r" (tails)
+            :
+            : "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
+            );
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8.hpp
index 7f928fa..7fac599 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,7 +61,10 @@
 
     kern_type kernel=a64_interleaved_bf16fp32_mmla_12x8;
 
-    interleaved_bf16fp32_mmla_12x8(const CPUInfo *ci) { UNUSED(ci); }
+    interleaved_bf16fp32_mmla_12x8(const CPUInfo *)
+    {
+
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp
index be87f44..7f0eff2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -59,42 +59,65 @@
                 "movi v13.4s, #0\n"
                 "ldr q6, [%[b_ptr], #0x20]\n"
                 "movi v14.4s, #0\n"
-                "ldr q3, [%[a_ptr], #0x30]\n"
+                "prfm PLDL1KEEP, [%[a_ptr], #0x40]\n"
                 "movi v15.4s, #0\n"
-                "ldr q7, [%[b_ptr], #0x30]\n"
+                "prfm PLDL1KEEP, [%[b_ptr], #0x40]\n"
                 "movi v16.4s, #0\n"
-                "add %[a_ptr], %[a_ptr], #0x40\n"
+                "prfm PLDL1KEEP, [%[a_ptr], #0x80]\n"
                 "movi v17.4s, #0\n"
-                "add %[b_ptr], %[b_ptr], #0x40\n"
+                "prfm PLDL1KEEP, [%[b_ptr], #0x80]\n"
                 "movi v18.4s, #0\n"
+                "prfm PLDL1KEEP, [%[a_ptr], #0xc0]\n"
                 "movi v19.4s, #0\n"
+                "prfm PLDL1KEEP, [%[b_ptr], #0xc0]\n"
                 "movi v20.4s, #0\n"
+                "prfm PLDL1KEEP, [%[a_ptr], #0x100]\n"
                 "movi v21.4s, #0\n"
+                "prfm PLDL1KEEP, [%[b_ptr], #0x100]\n"
                 "movi v22.4s, #0\n"
+                "prfm PLDL1KEEP, [%[a_ptr], #0x140]\n"
                 "movi v23.4s, #0\n"
+                "prfm PLDL1KEEP, [%[b_ptr], #0x140]\n"
                 "movi v24.4s, #0\n"
+                "prfm PLDL1KEEP, [%[a_ptr], #0x180]\n"
                 "movi v25.4s, #0\n"
+                "prfm PLDL1KEEP, [%[b_ptr], #0x180]\n"
                 "movi v26.4s, #0\n"
+                "prfm PLDL1KEEP, [%[a_ptr], #0x1c0]\n"
                 "movi v27.4s, #0\n"
+                "prfm PLDL1KEEP, [%[b_ptr], #0x1c0]\n"
                 "movi v28.4s, #0\n"
+                "prfm PLDL1KEEP, [%[b_ptr], #0x200]\n"
                 "movi v29.4s, #0\n"
+                "prfm PLDL1KEEP, [%[b_ptr], #0x240]\n"
                 "movi v30.4s, #0\n"
+                "prfm PLDL1KEEP, [%[b_ptr], #0x280]\n"
                 "movi v31.4s, #0\n"
+                "prfm PLDL1KEEP, [%[b_ptr], #0x2c0]\n"
+                "add %[a_ptr], %[a_ptr], #0x40\n"
+                "add %[b_ptr], %[b_ptr], #0x40\n"
                 "cbz %[loops], 1f\n"
                 "2:\n"
                 ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
-                "subs %[loops], %[loops], #0x1\n"
+                "ldr q7, [%[b_ptr], #-0x10]\n"
                 ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
+                "ldr q3, [%[a_ptr], #-0x10]\n"
                 ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+                "subs %[loops], %[loops], #0x1\n"
+                ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n"
+                "prfm PLDL1KEEP, [%[a_ptr], #0x1c0]\n"
                 ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
                 "ldr q4, [%[b_ptr]]\n"
-                ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n"
                 ".inst 0x6e45ec2f // bfmmla v15.4s, v1.8h, v5.8h\n"
+                "prfm PLDL1KEEP, [%[b_ptr], #0x2c0]\n"
                 ".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n"
+                "prfm PLDL1KEEP, [%[a_ptr], #0x200]\n"
                 ".inst 0x6e45ec7b // bfmmla v27.4s, v3.8h, v5.8h\n"
                 "ldr q5, [%[b_ptr], #0x10]\n"
                 ".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
+                "prfm PLDL1KEEP, [%[b_ptr], #0x300]\n"
                 ".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n"
+                "prfm PLDL1KEEP, [%[b_ptr], #0x340]\n"
                 ".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
                 ".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n"
                 "ldr q6, [%[b_ptr], #0x20]\n"
@@ -151,18 +174,18 @@
                 ".inst 0x6e47ec59 // bfmmla v25.4s, v2.8h, v7.8h\n"
                 "ldr q2, [%[a_ptr], #-0x20]\n"
                 ".inst 0x6e47ec7f // bfmmla v31.4s, v3.8h, v7.8h\n"
-                "ldr q7, [%[b_ptr], #-0x10]\n"
-                "ldr q3, [%[a_ptr], #-0x10]\n"
                 "b.ne 2b\n"
                 "1:\n"
                 "cbz %[tails], 3f\n"
                 ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
+                "ldr q7, [%[b_ptr], #-0x10]\n"
                 ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
+                "ldr q3, [%[a_ptr], #-0x10]\n"
                 ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
-                ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
-                "ldr q4, [%[b_ptr]]\n"
                 ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n"
                 ".inst 0x6e45ec2f // bfmmla v15.4s, v1.8h, v5.8h\n"
+                ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
+                "ldr q4, [%[b_ptr]]\n"
                 ".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n"
                 ".inst 0x6e45ec7b // bfmmla v27.4s, v3.8h, v5.8h\n"
                 "ldr q5, [%[b_ptr], #0x10]\n"
@@ -268,13 +291,15 @@
                 "b 4f\n"
                 "3:\n"
                 ".inst 0x6e44ec08 // bfmmla v8.4s, v0.8h, v4.8h\n"
-                "add %[a_ptr], %[a_ptr], #0x40\n"
+                "ldr q7, [%[b_ptr], #-0x10]\n"
                 ".inst 0x6e44ec2e // bfmmla v14.4s, v1.8h, v4.8h\n"
-                "add %[b_ptr], %[b_ptr], #0x80\n"
+                "ldr q3, [%[a_ptr], #-0x10]\n"
                 ".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
+                "add %[a_ptr], %[a_ptr], #0x40\n"
+                ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n"
+                "add %[b_ptr], %[b_ptr], #0x80\n"
                 ".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
                 "ldr q4, [%[b_ptr], #-0x80]\n"
-                ".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n"
                 ".inst 0x6e45ec2f // bfmmla v15.4s, v1.8h, v5.8h\n"
                 ".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n"
                 ".inst 0x6e45ec7b // bfmmla v27.4s, v3.8h, v5.8h\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp
index f669b87..7bfb229 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,9 +61,9 @@
 
     kern_type kernel=a64_interleaved_s8s32_mmla_12x8;
 
-    interleaved_s8s32_mmla_12x8(const CPUInfo *ci)
+    interleaved_s8s32_mmla_12x8(const CPUInfo *)
     {
-        UNUSED(ci);
+
     }
 };
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp
index 49dbdb8..7953510 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -59,13 +59,11 @@
                 "movi v13.4s, #0\n"
                 "ldr q6, [%[b_ptr], #0x20]\n"
                 "movi v14.4s, #0\n"
-                "ldr q3, [%[a_ptr], #0x30]\n"
-                "movi v15.4s, #0\n"
-                "ldr q7, [%[b_ptr], #0x30]\n"
-                "movi v16.4s, #0\n"
                 "add %[a_ptr], %[a_ptr], #0x40\n"
-                "movi v17.4s, #0\n"
+                "movi v15.4s, #0\n"
                 "add %[b_ptr], %[b_ptr], #0x40\n"
+                "movi v16.4s, #0\n"
+                "movi v17.4s, #0\n"
                 "movi v18.4s, #0\n"
                 "movi v19.4s, #0\n"
                 "movi v20.4s, #0\n"
@@ -83,12 +81,14 @@
                 "cbz %[loops], 1f\n"
                 "2:\n"
                 ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n"
-                "subs %[loops], %[loops], #0x1\n"
+                "ldr q7, [%[b_ptr], #-0x10]\n"
                 ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n"
+                "ldr q3, [%[a_ptr], #-0x10]\n"
                 ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n"
+                "subs %[loops], %[loops], #0x1\n"
+                ".inst 0x4e85a409 // smmla v9.4s, v0.16b, v5.16b\n"
                 ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
                 "ldr q4, [%[b_ptr]]\n"
-                ".inst 0x4e85a409 // smmla v9.4s, v0.16b, v5.16b\n"
                 ".inst 0x4e85a42f // smmla v15.4s, v1.16b, v5.16b\n"
                 ".inst 0x4e85a455 // smmla v21.4s, v2.16b, v5.16b\n"
                 ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n"
@@ -151,18 +151,18 @@
                 ".inst 0x4e87a459 // smmla v25.4s, v2.16b, v7.16b\n"
                 "ldr q2, [%[a_ptr], #-0x20]\n"
                 ".inst 0x4e87a47f // smmla v31.4s, v3.16b, v7.16b\n"
-                "ldr q7, [%[b_ptr], #-0x10]\n"
-                "ldr q3, [%[a_ptr], #-0x10]\n"
                 "b.ne 2b\n"
                 "1:\n"
                 "cbz %[tails], 3f\n"
                 ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n"
+                "ldr q7, [%[b_ptr], #-0x10]\n"
                 ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n"
+                "ldr q3, [%[a_ptr], #-0x10]\n"
                 ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n"
-                ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
-                "ldr q4, [%[b_ptr]]\n"
                 ".inst 0x4e85a409 // smmla v9.4s, v0.16b, v5.16b\n"
                 ".inst 0x4e85a42f // smmla v15.4s, v1.16b, v5.16b\n"
+                ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
+                "ldr q4, [%[b_ptr]]\n"
                 ".inst 0x4e85a455 // smmla v21.4s, v2.16b, v5.16b\n"
                 ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n"
                 "ldr q5, [%[b_ptr], #0x10]\n"
@@ -268,13 +268,15 @@
                 "b 4f\n"
                 "3:\n"
                 ".inst 0x4e84a408 // smmla v8.4s, v0.16b, v4.16b\n"
-                "add %[a_ptr], %[a_ptr], #0x40\n"
+                "ldr q7, [%[b_ptr], #-0x10]\n"
                 ".inst 0x4e84a42e // smmla v14.4s, v1.16b, v4.16b\n"
-                "add %[b_ptr], %[b_ptr], #0x80\n"
+                "ldr q3, [%[a_ptr], #-0x10]\n"
                 ".inst 0x4e84a454 // smmla v20.4s, v2.16b, v4.16b\n"
+                "add %[a_ptr], %[a_ptr], #0x40\n"
+                ".inst 0x4e85a409 // smmla v9.4s, v0.16b, v5.16b\n"
+                "add %[b_ptr], %[b_ptr], #0x80\n"
                 ".inst 0x4e84a47a // smmla v26.4s, v3.16b, v4.16b\n"
                 "ldr q4, [%[b_ptr], #-0x80]\n"
-                ".inst 0x4e85a409 // smmla v9.4s, v0.16b, v5.16b\n"
                 ".inst 0x4e85a42f // smmla v15.4s, v1.16b, v5.16b\n"
                 ".inst 0x4e85a455 // smmla v21.4s, v2.16b, v5.16b\n"
                 ".inst 0x4e85a47b // smmla v27.4s, v3.16b, v5.16b\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8.hpp
index d66edd8..d493517 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,9 +61,9 @@
 
     kern_type kernel=a64_interleaved_u8u32_mmla_12x8;
 
-    interleaved_u8u32_mmla_12x8(const CPUInfo *ci)
+    interleaved_u8u32_mmla_12x8(const CPUInfo *)
     {
-        UNUSED(ci);
+
     }
 };
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp
index e182a42..dcd15f0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -59,13 +59,11 @@
                 "movi v13.4s, #0\n"
                 "ldr q6, [%[b_ptr], #0x20]\n"
                 "movi v14.4s, #0\n"
-                "ldr q3, [%[a_ptr], #0x30]\n"
-                "movi v15.4s, #0\n"
-                "ldr q7, [%[b_ptr], #0x30]\n"
-                "movi v16.4s, #0\n"
                 "add %[a_ptr], %[a_ptr], #0x40\n"
-                "movi v17.4s, #0\n"
+                "movi v15.4s, #0\n"
                 "add %[b_ptr], %[b_ptr], #0x40\n"
+                "movi v16.4s, #0\n"
+                "movi v17.4s, #0\n"
                 "movi v18.4s, #0\n"
                 "movi v19.4s, #0\n"
                 "movi v20.4s, #0\n"
@@ -83,12 +81,14 @@
                 "cbz %[loops], 1f\n"
                 "2:\n"
                 ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
-                "subs %[loops], %[loops], #0x1\n"
+                "ldr q7, [%[b_ptr], #-0x10]\n"
                 ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
+                "ldr q3, [%[a_ptr], #-0x10]\n"
                 ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
+                "subs %[loops], %[loops], #0x1\n"
+                ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n"
                 ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
                 "ldr q4, [%[b_ptr]]\n"
-                ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n"
                 ".inst 0x6e85a42f // ummla v15.4s, v1.16b, v5.16b\n"
                 ".inst 0x6e85a455 // ummla v21.4s, v2.16b, v5.16b\n"
                 ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"
@@ -151,18 +151,18 @@
                 ".inst 0x6e87a459 // ummla v25.4s, v2.16b, v7.16b\n"
                 "ldr q2, [%[a_ptr], #-0x20]\n"
                 ".inst 0x6e87a47f // ummla v31.4s, v3.16b, v7.16b\n"
-                "ldr q7, [%[b_ptr], #-0x10]\n"
-                "ldr q3, [%[a_ptr], #-0x10]\n"
                 "b.ne 2b\n"
                 "1:\n"
                 "cbz %[tails], 3f\n"
                 ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
+                "ldr q7, [%[b_ptr], #-0x10]\n"
                 ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
+                "ldr q3, [%[a_ptr], #-0x10]\n"
                 ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
-                ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
-                "ldr q4, [%[b_ptr]]\n"
                 ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n"
                 ".inst 0x6e85a42f // ummla v15.4s, v1.16b, v5.16b\n"
+                ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
+                "ldr q4, [%[b_ptr]]\n"
                 ".inst 0x6e85a455 // ummla v21.4s, v2.16b, v5.16b\n"
                 ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"
                 "ldr q5, [%[b_ptr], #0x10]\n"
@@ -268,13 +268,15 @@
                 "b 4f\n"
                 "3:\n"
                 ".inst 0x6e84a408 // ummla v8.4s, v0.16b, v4.16b\n"
-                "add %[a_ptr], %[a_ptr], #0x40\n"
+                "ldr q7, [%[b_ptr], #-0x10]\n"
                 ".inst 0x6e84a42e // ummla v14.4s, v1.16b, v4.16b\n"
-                "add %[b_ptr], %[b_ptr], #0x80\n"
+                "ldr q3, [%[a_ptr], #-0x10]\n"
                 ".inst 0x6e84a454 // ummla v20.4s, v2.16b, v4.16b\n"
+                "add %[a_ptr], %[a_ptr], #0x40\n"
+                ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n"
+                "add %[b_ptr], %[b_ptr], #0x80\n"
                 ".inst 0x6e84a47a // ummla v26.4s, v3.16b, v4.16b\n"
                 "ldr q4, [%[b_ptr], #-0x80]\n"
-                ".inst 0x6e85a409 // ummla v9.4s, v0.16b, v5.16b\n"
                 ".inst 0x6e85a42f // ummla v15.4s, v1.16b, v5.16b\n"
                 ".inst 0x6e85a455 // ummla v21.4s, v2.16b, v5.16b\n"
                 ".inst 0x6e85a47b // ummla v27.4s, v3.16b, v5.16b\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_native_fp32_mla_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_native_fp32_mla_16x4.hpp
deleted file mode 100644
index a86e8ec..0000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_native_fp32_mla_16x4.hpp
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-namespace arm_gemm
-{
-
-// Actual kernel implementations
-void a64_native_fp32_mla_16x4(const float *, int, const float *, int ldb, float *, int, int, int, int, const float *, Activation, bool);
-
-class native_fp32_mla_16x4
-{
-public:
-    typedef float operand_type;
-    typedef float result_type;
-
-    typedef void (*kern_type)(const float *, int, const float *, int ldb, float *, int, int, int, int, const float *, Activation, bool);
-
-    /* Kernel blocking parameters */
-    static constexpr unsigned int out_height()
-    {
-        return 4;
-    }
-
-    static unsigned int out_width()
-    {
-        return 16;
-    }
-
-    static constexpr unsigned int k_unroll()
-    {
-        return 1;
-    }
-
-    static constexpr bool supports_append()
-    {
-        return false;
-    }
-
-    static constexpr bool supports_bias()
-    {
-        return true;
-    }
-
-    static constexpr bool supports_activation()
-    {
-        return true;
-    }
-
-
-
-    // Default to the generic kernel
-    kern_type kernel=a64_native_fp32_mla_16x4;
-
-    native_fp32_mla_16x4(const CPUInfo *ci) { UNUSED(ci); }
-};
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_native_fp32_mla_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_native_fp32_mla_16x4/generic.cpp
deleted file mode 100644
index 82e7333..0000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_native_fp32_mla_16x4/generic.cpp
+++ /dev/null
@@ -1,1708 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void a64_native_fp32_mla_16x4(const float *A, int lda, const float *B, int ldb, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool append) {
-    const long loops_count = ((K + 4) / 8) - 1;
-    K -= loops_count * 8;
-    const long regs_count = (K / 4) - 1;
-    K -= (regs_count + 1) * 4;
-    const long blocks_count = K / 1;
-    float nullbias[16];
-    if (!append && !bias) {
-        memset(nullbias, 0, (16 * sizeof(float)));
-    }
-    float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
-    float maxval =   static_cast<float>(std::numeric_limits<float>::infinity());
-    const float * const minptr = &minval;
-    const float * const maxptr = &maxval;
-
-    switch(act.type)
-    {
-        default:
-        case Activation::Type::None:
-            break;
-        case Activation::Type::BoundedReLU:
-            maxval = static_cast<float>(act.param1);
-            /* fall through */
-        case Activation::Type::ReLU:
-            minval = 0.0f;
-            break;
-    }
-
-    for (int y=0; y<M; y+=4) {
-        const float * const a_ptr0_base = A + (y * lda);
-        const unsigned long ldab = lda * sizeof(float);
-
-        float *c_ptr0 = C + (y * ldc);
-
-        for (int x0=0; x0<N; x0+=16ul) {
-            const long width = std::min((unsigned long)N-x0, 16ul);
-            long loops = loops_count;
-            long regs = regs_count;
-            long blocks = blocks_count;
-            const float *a_ptr0 = a_ptr0_base;
-            const float *b_ptr0 = B + x0;
-            long ldbb = ldb * sizeof(float);
-            const unsigned long ldcb = ldc * sizeof(float);
-            const float *biasptr = bias ? bias+x0 : nullbias;
-
-            switch(M-y) {
-                case 1:
-                    __asm __volatile (
-                        "ldr q16, [%[biasptr]]\n"
-                        "ldr q17, [%[biasptr], #0x10]\n"
-                        "ldr q18, [%[biasptr], #0x20]\n"
-                        "ldr q19, [%[biasptr], #0x30]\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v4.s[1]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        "fmla v17.4s, v13.4s, v4.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v18.4s, v14.4s, v4.s[1]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v19.4s, v15.4s, v4.s[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v4.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v17.4s, v13.4s, v4.s[3]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v18.4s, v14.4s, v4.s[3]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v19.4s, v15.4s, v4.s[3]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "cbz %[regs], 3f\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v16.4s, v12.4s, v4.s[1]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v13.4s, v4.s[1]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v14.4s, v4.s[1]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v15.4s, v4.s[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "fmla v16.4s, v12.4s, v4.s[3]\n"
-                        "fmla v17.4s, v13.4s, v4.s[3]\n"
-                        "fmla v18.4s, v14.4s, v4.s[3]\n"
-                        "fmla v19.4s, v15.4s, v4.s[3]\n"
-                        "b 4f\n"
-                        "3:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "4:\n"
-                        "cbz %[blocks], 5f\n"
-                        "6:\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "b.ne 6b\n"
-                        "5:\n"
-                        "ld1r {v14.4s}, [%[minptr]]\n"
-                        "ld1r {v15.4s}, [%[maxptr]]\n"
-                        "fmax v16.4s, v16.4s, v14.4s\n"
-                        "fmax v17.4s, v17.4s, v14.4s\n"
-                        "fmax v18.4s, v18.4s, v14.4s\n"
-                        "fmax v19.4s, v19.4s, v14.4s\n"
-                        "fmin v16.4s, v16.4s, v15.4s\n"
-                        "fmin v17.4s, v17.4s, v15.4s\n"
-                        "fmin v18.4s, v18.4s, v15.4s\n"
-                        "fmin v19.4s, v19.4s, v15.4s\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [ldb] "r" (ldbb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
-                    );
-                    break;
-                case 2:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "ldr q16, [%[biasptr]]\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "ldr q17, [%[biasptr], #0x10]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "mov v20.16b, v16.16b\n"
-                        "ldr q18, [%[biasptr], #0x20]\n"
-                        "ldr q19, [%[biasptr], #0x30]\n"
-                        "mov v21.16b, v17.16b\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "mov v22.16b, v18.16b\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "mov v23.16b, v19.16b\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v20.4s, v12.4s, v1.s[1]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v21.4s, v13.4s, v1.s[1]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v22.4s, v14.4s, v1.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla v23.4s, v15.4s, v1.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v12.4s, v1.s[3]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "fmla v21.4s, v13.4s, v1.s[3]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "fmla v22.4s, v14.4s, v1.s[3]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        "fmla v23.4s, v15.4s, v1.s[3]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        "fmla v20.4s, v8.4s, v5.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v21.4s, v9.4s, v5.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "fmla v22.4s, v10.4s, v5.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "fmla v23.4s, v11.4s, v5.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v4.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v12.4s, v5.s[1]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v13.4s, v4.s[1]\n"
-                        "fmla v21.4s, v13.4s, v5.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v14.4s, v4.s[1]\n"
-                        "fmla v22.4s, v14.4s, v5.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v15.4s, v4.s[1]\n"
-                        "fmla v23.4s, v15.4s, v5.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v8.4s, v5.s[2]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "fmla v21.4s, v9.4s, v5.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "fmla v22.4s, v10.4s, v5.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "fmla v23.4s, v11.4s, v5.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v4.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v12.4s, v5.s[3]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v13.4s, v4.s[3]\n"
-                        "fmla v21.4s, v13.4s, v5.s[3]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v14.4s, v4.s[3]\n"
-                        "fmla v22.4s, v14.4s, v5.s[3]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v15.4s, v4.s[3]\n"
-                        "fmla v23.4s, v15.4s, v5.s[3]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "cbz %[regs], 3f\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v12.4s, v1.s[1]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "fmla v21.4s, v13.4s, v1.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "fmla v22.4s, v14.4s, v1.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "fmla v23.4s, v15.4s, v1.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v12.4s, v1.s[3]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "fmla v21.4s, v13.4s, v1.s[3]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "fmla v22.4s, v14.4s, v1.s[3]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "fmla v23.4s, v15.4s, v1.s[3]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v8.4s, v5.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "fmla v21.4s, v9.4s, v5.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "fmla v22.4s, v10.4s, v5.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "fmla v23.4s, v11.4s, v5.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v4.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v12.4s, v5.s[1]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v13.4s, v4.s[1]\n"
-                        "fmla v21.4s, v13.4s, v5.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v14.4s, v4.s[1]\n"
-                        "fmla v22.4s, v14.4s, v5.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v15.4s, v4.s[1]\n"
-                        "fmla v23.4s, v15.4s, v5.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "fmla v20.4s, v8.4s, v5.s[2]\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "fmla v21.4s, v9.4s, v5.s[2]\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "fmla v22.4s, v10.4s, v5.s[2]\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "fmla v23.4s, v11.4s, v5.s[2]\n"
-                        "fmla v16.4s, v12.4s, v4.s[3]\n"
-                        "fmla v20.4s, v12.4s, v5.s[3]\n"
-                        "fmla v17.4s, v13.4s, v4.s[3]\n"
-                        "fmla v21.4s, v13.4s, v5.s[3]\n"
-                        "fmla v18.4s, v14.4s, v4.s[3]\n"
-                        "fmla v22.4s, v14.4s, v5.s[3]\n"
-                        "fmla v19.4s, v15.4s, v4.s[3]\n"
-                        "fmla v23.4s, v15.4s, v5.s[3]\n"
-                        "b 4f\n"
-                        "3:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v12.4s, v1.s[1]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "fmla v21.4s, v13.4s, v1.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "fmla v22.4s, v14.4s, v1.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "fmla v23.4s, v15.4s, v1.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "fmla v20.4s, v12.4s, v1.s[3]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "fmla v21.4s, v13.4s, v1.s[3]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "fmla v22.4s, v14.4s, v1.s[3]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "fmla v23.4s, v15.4s, v1.s[3]\n"
-                        "4:\n"
-                        "cbz %[blocks], 5f\n"
-                        "6:\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s1, [a_ptr1]\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "b.ne 6b\n"
-                        "5:\n"
-                        "ld1r {v14.4s}, [%[minptr]]\n"
-                        "ld1r {v15.4s}, [%[maxptr]]\n"
-                        "fmax v16.4s, v16.4s, v14.4s\n"
-                        "fmax v17.4s, v17.4s, v14.4s\n"
-                        "fmax v18.4s, v18.4s, v14.4s\n"
-                        "fmax v19.4s, v19.4s, v14.4s\n"
-                        "fmin v16.4s, v16.4s, v15.4s\n"
-                        "fmin v17.4s, v17.4s, v15.4s\n"
-                        "fmin v18.4s, v18.4s, v15.4s\n"
-                        "fmin v19.4s, v19.4s, v15.4s\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "fmax v20.4s, v20.4s, v14.4s\n"
-                        "fmax v21.4s, v21.4s, v14.4s\n"
-                        "fmax v22.4s, v22.4s, v14.4s\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "fmax v23.4s, v23.4s, v14.4s\n"
-                        "fmin v20.4s, v20.4s, v15.4s\n"
-                        "fmin v21.4s, v21.4s, v15.4s\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "fmin v22.4s, v22.4s, v15.4s\n"
-                        "fmin v23.4s, v23.4s, v15.4s\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "str q20, [c_ptr1]\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [ldb] "r" (ldbb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
-                    );
-                    break;
-                case 3:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "ldr q16, [%[biasptr]]\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "ldr q17, [%[biasptr], #0x10]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "mov v20.16b, v16.16b\n"
-                        "ldr q18, [%[biasptr], #0x20]\n"
-                        "mov v24.16b, v16.16b\n"
-                        "ldr q19, [%[biasptr], #0x30]\n"
-                        "mov v21.16b, v17.16b\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mov v25.16b, v17.16b\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mov v22.16b, v18.16b\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "mov v23.16b, v19.16b\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "mov v26.16b, v18.16b\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "mov v27.16b, v19.16b\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v12.4s, v1.s[1]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla v24.4s, v12.4s, v2.s[1]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "fmla v21.4s, v13.4s, v1.s[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "fmla v25.4s, v13.4s, v2.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        "fmla v22.4s, v14.4s, v1.s[1]\n"
-                        "fmla v26.4s, v14.4s, v2.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "fmla v23.4s, v15.4s, v1.s[1]\n"
-                        "fmla v27.4s, v15.4s, v2.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "fmla v24.4s, v8.4s, v2.s[2]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "fmla v25.4s, v9.4s, v2.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "fmla v26.4s, v10.4s, v2.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "fmla v27.4s, v11.4s, v2.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v12.4s, v1.s[3]\n"
-                        "fmla v24.4s, v12.4s, v2.s[3]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "fmla v21.4s, v13.4s, v1.s[3]\n"
-                        "fmla v25.4s, v13.4s, v2.s[3]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "fmla v22.4s, v14.4s, v1.s[3]\n"
-                        "fmla v26.4s, v14.4s, v2.s[3]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        "fmla v23.4s, v15.4s, v1.s[3]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        "fmla v27.4s, v15.4s, v2.s[3]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "ldr q2, [a_ptr2, #-0x10]\n"
-                        "fmla v20.4s, v8.4s, v5.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v24.4s, v8.4s, v6.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "fmla v21.4s, v9.4s, v5.s[0]\n"
-                        "fmla v25.4s, v9.4s, v6.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "fmla v22.4s, v10.4s, v5.s[0]\n"
-                        "fmla v26.4s, v10.4s, v6.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "fmla v23.4s, v11.4s, v5.s[0]\n"
-                        "fmla v27.4s, v11.4s, v6.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v4.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v12.4s, v5.s[1]\n"
-                        "fmla v24.4s, v12.4s, v6.s[1]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v13.4s, v4.s[1]\n"
-                        "fmla v21.4s, v13.4s, v5.s[1]\n"
-                        "fmla v25.4s, v13.4s, v6.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v14.4s, v4.s[1]\n"
-                        "fmla v22.4s, v14.4s, v5.s[1]\n"
-                        "fmla v26.4s, v14.4s, v6.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v15.4s, v4.s[1]\n"
-                        "fmla v23.4s, v15.4s, v5.s[1]\n"
-                        "fmla v27.4s, v15.4s, v6.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v8.4s, v5.s[2]\n"
-                        "fmla v24.4s, v8.4s, v6.s[2]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "fmla v21.4s, v9.4s, v5.s[2]\n"
-                        "fmla v25.4s, v9.4s, v6.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "fmla v22.4s, v10.4s, v5.s[2]\n"
-                        "fmla v26.4s, v10.4s, v6.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "fmla v23.4s, v11.4s, v5.s[2]\n"
-                        "fmla v27.4s, v11.4s, v6.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v4.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v12.4s, v5.s[3]\n"
-                        "fmla v24.4s, v12.4s, v6.s[3]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v13.4s, v4.s[3]\n"
-                        "fmla v21.4s, v13.4s, v5.s[3]\n"
-                        "fmla v25.4s, v13.4s, v6.s[3]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v14.4s, v4.s[3]\n"
-                        "fmla v22.4s, v14.4s, v5.s[3]\n"
-                        "fmla v26.4s, v14.4s, v6.s[3]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v15.4s, v4.s[3]\n"
-                        "fmla v23.4s, v15.4s, v5.s[3]\n"
-                        "fmla v27.4s, v15.4s, v6.s[3]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "cbz %[regs], 3f\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v12.4s, v1.s[1]\n"
-                        "fmla v24.4s, v12.4s, v2.s[1]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "fmla v21.4s, v13.4s, v1.s[1]\n"
-                        "fmla v25.4s, v13.4s, v2.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "fmla v22.4s, v14.4s, v1.s[1]\n"
-                        "fmla v26.4s, v14.4s, v2.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "fmla v23.4s, v15.4s, v1.s[1]\n"
-                        "fmla v27.4s, v15.4s, v2.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "fmla v24.4s, v8.4s, v2.s[2]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "fmla v25.4s, v9.4s, v2.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "fmla v26.4s, v10.4s, v2.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "fmla v27.4s, v11.4s, v2.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v12.4s, v1.s[3]\n"
-                        "fmla v24.4s, v12.4s, v2.s[3]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "fmla v21.4s, v13.4s, v1.s[3]\n"
-                        "fmla v25.4s, v13.4s, v2.s[3]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "fmla v22.4s, v14.4s, v1.s[3]\n"
-                        "fmla v26.4s, v14.4s, v2.s[3]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "fmla v23.4s, v15.4s, v1.s[3]\n"
-                        "fmla v27.4s, v15.4s, v2.s[3]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v8.4s, v5.s[0]\n"
-                        "fmla v24.4s, v8.4s, v6.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "fmla v21.4s, v9.4s, v5.s[0]\n"
-                        "fmla v25.4s, v9.4s, v6.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "fmla v22.4s, v10.4s, v5.s[0]\n"
-                        "fmla v26.4s, v10.4s, v6.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "fmla v23.4s, v11.4s, v5.s[0]\n"
-                        "fmla v27.4s, v11.4s, v6.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v4.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v12.4s, v5.s[1]\n"
-                        "fmla v24.4s, v12.4s, v6.s[1]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v13.4s, v4.s[1]\n"
-                        "fmla v21.4s, v13.4s, v5.s[1]\n"
-                        "fmla v25.4s, v13.4s, v6.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v14.4s, v4.s[1]\n"
-                        "fmla v22.4s, v14.4s, v5.s[1]\n"
-                        "fmla v26.4s, v14.4s, v6.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v15.4s, v4.s[1]\n"
-                        "fmla v23.4s, v15.4s, v5.s[1]\n"
-                        "fmla v27.4s, v15.4s, v6.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "fmla v20.4s, v8.4s, v5.s[2]\n"
-                        "fmla v24.4s, v8.4s, v6.s[2]\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "fmla v21.4s, v9.4s, v5.s[2]\n"
-                        "fmla v25.4s, v9.4s, v6.s[2]\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "fmla v22.4s, v10.4s, v5.s[2]\n"
-                        "fmla v26.4s, v10.4s, v6.s[2]\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "fmla v23.4s, v11.4s, v5.s[2]\n"
-                        "fmla v27.4s, v11.4s, v6.s[2]\n"
-                        "fmla v16.4s, v12.4s, v4.s[3]\n"
-                        "fmla v20.4s, v12.4s, v5.s[3]\n"
-                        "fmla v24.4s, v12.4s, v6.s[3]\n"
-                        "fmla v17.4s, v13.4s, v4.s[3]\n"
-                        "fmla v21.4s, v13.4s, v5.s[3]\n"
-                        "fmla v25.4s, v13.4s, v6.s[3]\n"
-                        "fmla v18.4s, v14.4s, v4.s[3]\n"
-                        "fmla v22.4s, v14.4s, v5.s[3]\n"
-                        "fmla v26.4s, v14.4s, v6.s[3]\n"
-                        "fmla v19.4s, v15.4s, v4.s[3]\n"
-                        "fmla v23.4s, v15.4s, v5.s[3]\n"
-                        "fmla v27.4s, v15.4s, v6.s[3]\n"
-                        "b 4f\n"
-                        "3:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v12.4s, v1.s[1]\n"
-                        "fmla v24.4s, v12.4s, v2.s[1]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "fmla v21.4s, v13.4s, v1.s[1]\n"
-                        "fmla v25.4s, v13.4s, v2.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "fmla v22.4s, v14.4s, v1.s[1]\n"
-                        "fmla v26.4s, v14.4s, v2.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "fmla v23.4s, v15.4s, v1.s[1]\n"
-                        "fmla v27.4s, v15.4s, v2.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "fmla v24.4s, v8.4s, v2.s[2]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "fmla v25.4s, v9.4s, v2.s[2]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "fmla v26.4s, v10.4s, v2.s[2]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "fmla v27.4s, v11.4s, v2.s[2]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "fmla v20.4s, v12.4s, v1.s[3]\n"
-                        "fmla v24.4s, v12.4s, v2.s[3]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "fmla v21.4s, v13.4s, v1.s[3]\n"
-                        "fmla v25.4s, v13.4s, v2.s[3]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "fmla v22.4s, v14.4s, v1.s[3]\n"
-                        "fmla v26.4s, v14.4s, v2.s[3]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "fmla v23.4s, v15.4s, v1.s[3]\n"
-                        "fmla v27.4s, v15.4s, v2.s[3]\n"
-                        "4:\n"
-                        "cbz %[blocks], 5f\n"
-                        "6:\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s1, [a_ptr1]\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr s2, [a_ptr2]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "add a_ptr2, a_ptr2, #0x4\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "b.ne 6b\n"
-                        "5:\n"
-                        "ld1r {v14.4s}, [%[minptr]]\n"
-                        "ld1r {v15.4s}, [%[maxptr]]\n"
-                        "fmax v16.4s, v16.4s, v14.4s\n"
-                        "fmax v17.4s, v17.4s, v14.4s\n"
-                        "fmax v18.4s, v18.4s, v14.4s\n"
-                        "fmax v19.4s, v19.4s, v14.4s\n"
-                        "fmin v16.4s, v16.4s, v15.4s\n"
-                        "fmin v17.4s, v17.4s, v15.4s\n"
-                        "fmin v18.4s, v18.4s, v15.4s\n"
-                        "fmin v19.4s, v19.4s, v15.4s\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "fmax v20.4s, v20.4s, v14.4s\n"
-                        "fmax v21.4s, v21.4s, v14.4s\n"
-                        "fmax v22.4s, v22.4s, v14.4s\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "fmax v23.4s, v23.4s, v14.4s\n"
-                        "fmin v20.4s, v20.4s, v15.4s\n"
-                        "fmin v21.4s, v21.4s, v15.4s\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "fmin v22.4s, v22.4s, v15.4s\n"
-                        "fmin v23.4s, v23.4s, v15.4s\n"
-                        "fmax v24.4s, v24.4s, v14.4s\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "fmax v25.4s, v25.4s, v14.4s\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "fmax v26.4s, v26.4s, v14.4s\n"
-                        "str q20, [c_ptr1]\n"
-                        "fmin v24.4s, v24.4s, v15.4s\n"
-                        "fmin v25.4s, v25.4s, v15.4s\n"
-                        "fmax v27.4s, v27.4s, v14.4s\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "fmin v26.4s, v26.4s, v15.4s\n"
-                        "fmin v27.4s, v27.4s, v15.4s\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "str q24, [c_ptr2]\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [ldb] "r" (ldbb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
-                    );
-                    break;
-                default:
-                case 4:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "ldr q16, [%[biasptr]]\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "ldr q17, [%[biasptr], #0x10]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "mov v20.16b, v16.16b\n"
-                        "ldr q18, [%[biasptr], #0x20]\n"
-                        "mov v24.16b, v16.16b\n"
-                        "ldr q19, [%[biasptr], #0x30]\n"
-                        "mov v21.16b, v17.16b\n"
-                        "ldr q0, [%[a_ptr0]]\n"
-                        "mov v25.16b, v17.16b\n"
-                        "ldr q1, [a_ptr1]\n"
-                        "mov v22.16b, v18.16b\n"
-                        "ldr q2, [a_ptr2]\n"
-                        "mov v23.16b, v19.16b\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "mov v26.16b, v18.16b\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "mov v27.16b, v19.16b\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "mov v28.16b, v16.16b\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "mov v29.16b, v17.16b\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "mov v30.16b, v18.16b\n"
-                        "ldr q3, [a_ptr3]\n"
-                        "mov v31.16b, v19.16b\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        "fmla v28.4s, v8.4s, v3.s[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "ldr q7, [a_ptr3]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v29.4s, v9.4s, v3.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla v30.4s, v10.4s, v3.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        "fmla v31.4s, v11.4s, v3.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v12.4s, v1.s[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
-                        "fmla v24.4s, v12.4s, v2.s[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
-                        "fmla v28.4s, v12.4s, v3.s[1]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
-                        "fmla v21.4s, v13.4s, v1.s[1]\n"
-                        "fmla v25.4s, v13.4s, v2.s[1]\n"
-                        "fmla v29.4s, v13.4s, v3.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "fmla v22.4s, v14.4s, v1.s[1]\n"
-                        "fmla v26.4s, v14.4s, v2.s[1]\n"
-                        "fmla v30.4s, v14.4s, v3.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "fmla v23.4s, v15.4s, v1.s[1]\n"
-                        "fmla v27.4s, v15.4s, v2.s[1]\n"
-                        "fmla v31.4s, v15.4s, v3.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "fmla v24.4s, v8.4s, v2.s[2]\n"
-                        "fmla v28.4s, v8.4s, v3.s[2]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "fmla v25.4s, v9.4s, v2.s[2]\n"
-                        "fmla v29.4s, v9.4s, v3.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "fmla v26.4s, v10.4s, v2.s[2]\n"
-                        "fmla v30.4s, v10.4s, v3.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "fmla v27.4s, v11.4s, v2.s[2]\n"
-                        "fmla v31.4s, v11.4s, v3.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v12.4s, v1.s[3]\n"
-                        "fmla v24.4s, v12.4s, v2.s[3]\n"
-                        "fmla v28.4s, v12.4s, v3.s[3]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "fmla v21.4s, v13.4s, v1.s[3]\n"
-                        "fmla v25.4s, v13.4s, v2.s[3]\n"
-                        "fmla v29.4s, v13.4s, v3.s[3]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "fmla v22.4s, v14.4s, v1.s[3]\n"
-                        "fmla v26.4s, v14.4s, v2.s[3]\n"
-                        "fmla v30.4s, v14.4s, v3.s[3]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "ldr q0, [%[a_ptr0], #-0x10]\n"
-                        "fmla v23.4s, v15.4s, v1.s[3]\n"
-                        "ldr q1, [a_ptr1, #-0x10]\n"
-                        "fmla v27.4s, v15.4s, v2.s[3]\n"
-                        "ldr q2, [a_ptr2, #-0x10]\n"
-                        "fmla v31.4s, v15.4s, v3.s[3]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "ldr q3, [a_ptr3, #-0x10]\n"
-                        "fmla v20.4s, v8.4s, v5.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v24.4s, v8.4s, v6.s[0]\n"
-                        "fmla v28.4s, v8.4s, v7.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "fmla v21.4s, v9.4s, v5.s[0]\n"
-                        "fmla v25.4s, v9.4s, v6.s[0]\n"
-                        "fmla v29.4s, v9.4s, v7.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "fmla v22.4s, v10.4s, v5.s[0]\n"
-                        "fmla v26.4s, v10.4s, v6.s[0]\n"
-                        "fmla v30.4s, v10.4s, v7.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "fmla v23.4s, v11.4s, v5.s[0]\n"
-                        "fmla v27.4s, v11.4s, v6.s[0]\n"
-                        "fmla v31.4s, v11.4s, v7.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v4.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v12.4s, v5.s[1]\n"
-                        "fmla v24.4s, v12.4s, v6.s[1]\n"
-                        "fmla v28.4s, v12.4s, v7.s[1]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v13.4s, v4.s[1]\n"
-                        "fmla v21.4s, v13.4s, v5.s[1]\n"
-                        "fmla v25.4s, v13.4s, v6.s[1]\n"
-                        "fmla v29.4s, v13.4s, v7.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v14.4s, v4.s[1]\n"
-                        "fmla v22.4s, v14.4s, v5.s[1]\n"
-                        "fmla v26.4s, v14.4s, v6.s[1]\n"
-                        "fmla v30.4s, v14.4s, v7.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v15.4s, v4.s[1]\n"
-                        "fmla v23.4s, v15.4s, v5.s[1]\n"
-                        "fmla v27.4s, v15.4s, v6.s[1]\n"
-                        "fmla v31.4s, v15.4s, v7.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v8.4s, v5.s[2]\n"
-                        "fmla v24.4s, v8.4s, v6.s[2]\n"
-                        "fmla v28.4s, v8.4s, v7.s[2]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "fmla v21.4s, v9.4s, v5.s[2]\n"
-                        "fmla v25.4s, v9.4s, v6.s[2]\n"
-                        "fmla v29.4s, v9.4s, v7.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "fmla v22.4s, v10.4s, v5.s[2]\n"
-                        "fmla v26.4s, v10.4s, v6.s[2]\n"
-                        "fmla v30.4s, v10.4s, v7.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "fmla v23.4s, v11.4s, v5.s[2]\n"
-                        "fmla v27.4s, v11.4s, v6.s[2]\n"
-                        "fmla v31.4s, v11.4s, v7.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v4.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v12.4s, v5.s[3]\n"
-                        "fmla v24.4s, v12.4s, v6.s[3]\n"
-                        "fmla v28.4s, v12.4s, v7.s[3]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v13.4s, v4.s[3]\n"
-                        "fmla v21.4s, v13.4s, v5.s[3]\n"
-                        "fmla v25.4s, v13.4s, v6.s[3]\n"
-                        "fmla v29.4s, v13.4s, v7.s[3]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v14.4s, v4.s[3]\n"
-                        "fmla v22.4s, v14.4s, v5.s[3]\n"
-                        "fmla v26.4s, v14.4s, v6.s[3]\n"
-                        "fmla v30.4s, v14.4s, v7.s[3]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v15.4s, v4.s[3]\n"
-                        "fmla v23.4s, v15.4s, v5.s[3]\n"
-                        "fmla v27.4s, v15.4s, v6.s[3]\n"
-                        "fmla v31.4s, v15.4s, v7.s[3]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "prfm PSTL1KEEP, [%[c_ptr0]]\n"
-                        "prfm PSTL1KEEP, [c_ptr1]\n"
-                        "prfm PSTL1KEEP, [c_ptr2]\n"
-                        "prfm PSTL1KEEP, [c_ptr3]\n"
-                        "cbz %[regs], 3f\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q4, [%[a_ptr0]]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr q5, [a_ptr1]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "ldr q6, [a_ptr2]\n"
-                        "fmla v28.4s, v8.4s, v3.s[0]\n"
-                        "ldr q7, [a_ptr3]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "fmla v29.4s, v9.4s, v3.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "fmla v30.4s, v10.4s, v3.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "fmla v31.4s, v11.4s, v3.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v12.4s, v1.s[1]\n"
-                        "fmla v24.4s, v12.4s, v2.s[1]\n"
-                        "fmla v28.4s, v12.4s, v3.s[1]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "fmla v21.4s, v13.4s, v1.s[1]\n"
-                        "fmla v25.4s, v13.4s, v2.s[1]\n"
-                        "fmla v29.4s, v13.4s, v3.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "fmla v22.4s, v14.4s, v1.s[1]\n"
-                        "fmla v26.4s, v14.4s, v2.s[1]\n"
-                        "fmla v30.4s, v14.4s, v3.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "fmla v23.4s, v15.4s, v1.s[1]\n"
-                        "fmla v27.4s, v15.4s, v2.s[1]\n"
-                        "fmla v31.4s, v15.4s, v3.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "fmla v24.4s, v8.4s, v2.s[2]\n"
-                        "fmla v28.4s, v8.4s, v3.s[2]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "fmla v25.4s, v9.4s, v2.s[2]\n"
-                        "fmla v29.4s, v9.4s, v3.s[2]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "fmla v26.4s, v10.4s, v2.s[2]\n"
-                        "fmla v30.4s, v10.4s, v3.s[2]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "fmla v27.4s, v11.4s, v2.s[2]\n"
-                        "fmla v31.4s, v11.4s, v3.s[2]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v12.4s, v1.s[3]\n"
-                        "fmla v24.4s, v12.4s, v2.s[3]\n"
-                        "fmla v28.4s, v12.4s, v3.s[3]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "fmla v21.4s, v13.4s, v1.s[3]\n"
-                        "fmla v25.4s, v13.4s, v2.s[3]\n"
-                        "fmla v29.4s, v13.4s, v3.s[3]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "fmla v22.4s, v14.4s, v1.s[3]\n"
-                        "fmla v26.4s, v14.4s, v2.s[3]\n"
-                        "fmla v30.4s, v14.4s, v3.s[3]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "fmla v23.4s, v15.4s, v1.s[3]\n"
-                        "fmla v27.4s, v15.4s, v2.s[3]\n"
-                        "fmla v31.4s, v15.4s, v3.s[3]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v4.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v8.4s, v5.s[0]\n"
-                        "fmla v24.4s, v8.4s, v6.s[0]\n"
-                        "fmla v28.4s, v8.4s, v7.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v4.s[0]\n"
-                        "fmla v21.4s, v9.4s, v5.s[0]\n"
-                        "fmla v25.4s, v9.4s, v6.s[0]\n"
-                        "fmla v29.4s, v9.4s, v7.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v4.s[0]\n"
-                        "fmla v22.4s, v10.4s, v5.s[0]\n"
-                        "fmla v26.4s, v10.4s, v6.s[0]\n"
-                        "fmla v30.4s, v10.4s, v7.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v4.s[0]\n"
-                        "fmla v23.4s, v11.4s, v5.s[0]\n"
-                        "fmla v27.4s, v11.4s, v6.s[0]\n"
-                        "fmla v31.4s, v11.4s, v7.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v4.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v12.4s, v5.s[1]\n"
-                        "fmla v24.4s, v12.4s, v6.s[1]\n"
-                        "fmla v28.4s, v12.4s, v7.s[1]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v13.4s, v4.s[1]\n"
-                        "fmla v21.4s, v13.4s, v5.s[1]\n"
-                        "fmla v25.4s, v13.4s, v6.s[1]\n"
-                        "fmla v29.4s, v13.4s, v7.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v14.4s, v4.s[1]\n"
-                        "fmla v22.4s, v14.4s, v5.s[1]\n"
-                        "fmla v26.4s, v14.4s, v6.s[1]\n"
-                        "fmla v30.4s, v14.4s, v7.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v15.4s, v4.s[1]\n"
-                        "fmla v23.4s, v15.4s, v5.s[1]\n"
-                        "fmla v27.4s, v15.4s, v6.s[1]\n"
-                        "fmla v31.4s, v15.4s, v7.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v4.s[2]\n"
-                        "fmla v20.4s, v8.4s, v5.s[2]\n"
-                        "fmla v24.4s, v8.4s, v6.s[2]\n"
-                        "fmla v28.4s, v8.4s, v7.s[2]\n"
-                        "fmla v17.4s, v9.4s, v4.s[2]\n"
-                        "fmla v21.4s, v9.4s, v5.s[2]\n"
-                        "fmla v25.4s, v9.4s, v6.s[2]\n"
-                        "fmla v29.4s, v9.4s, v7.s[2]\n"
-                        "fmla v18.4s, v10.4s, v4.s[2]\n"
-                        "fmla v22.4s, v10.4s, v5.s[2]\n"
-                        "fmla v26.4s, v10.4s, v6.s[2]\n"
-                        "fmla v30.4s, v10.4s, v7.s[2]\n"
-                        "fmla v19.4s, v11.4s, v4.s[2]\n"
-                        "fmla v23.4s, v11.4s, v5.s[2]\n"
-                        "fmla v27.4s, v11.4s, v6.s[2]\n"
-                        "fmla v31.4s, v11.4s, v7.s[2]\n"
-                        "fmla v16.4s, v12.4s, v4.s[3]\n"
-                        "fmla v20.4s, v12.4s, v5.s[3]\n"
-                        "fmla v24.4s, v12.4s, v6.s[3]\n"
-                        "fmla v28.4s, v12.4s, v7.s[3]\n"
-                        "fmla v17.4s, v13.4s, v4.s[3]\n"
-                        "fmla v21.4s, v13.4s, v5.s[3]\n"
-                        "fmla v25.4s, v13.4s, v6.s[3]\n"
-                        "fmla v29.4s, v13.4s, v7.s[3]\n"
-                        "fmla v18.4s, v14.4s, v4.s[3]\n"
-                        "fmla v22.4s, v14.4s, v5.s[3]\n"
-                        "fmla v26.4s, v14.4s, v6.s[3]\n"
-                        "fmla v30.4s, v14.4s, v7.s[3]\n"
-                        "fmla v19.4s, v15.4s, v4.s[3]\n"
-                        "fmla v23.4s, v15.4s, v5.s[3]\n"
-                        "fmla v27.4s, v15.4s, v6.s[3]\n"
-                        "fmla v31.4s, v15.4s, v7.s[3]\n"
-                        "b 4f\n"
-                        "3:\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "fmla v28.4s, v8.4s, v3.s[0]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "fmla v29.4s, v9.4s, v3.s[0]\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "fmla v30.4s, v10.4s, v3.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "fmla v31.4s, v11.4s, v3.s[0]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v12.4s, v0.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla v20.4s, v12.4s, v1.s[1]\n"
-                        "fmla v24.4s, v12.4s, v2.s[1]\n"
-                        "fmla v28.4s, v12.4s, v3.s[1]\n"
-                        "ldr q12, [%[b_ptr0]]\n"
-                        "fmla v17.4s, v13.4s, v0.s[1]\n"
-                        "fmla v21.4s, v13.4s, v1.s[1]\n"
-                        "fmla v25.4s, v13.4s, v2.s[1]\n"
-                        "fmla v29.4s, v13.4s, v3.s[1]\n"
-                        "ldr q13, [%[b_ptr0], #0x10]\n"
-                        "fmla v18.4s, v14.4s, v0.s[1]\n"
-                        "fmla v22.4s, v14.4s, v1.s[1]\n"
-                        "fmla v26.4s, v14.4s, v2.s[1]\n"
-                        "fmla v30.4s, v14.4s, v3.s[1]\n"
-                        "ldr q14, [%[b_ptr0], #0x20]\n"
-                        "fmla v19.4s, v15.4s, v0.s[1]\n"
-                        "fmla v23.4s, v15.4s, v1.s[1]\n"
-                        "fmla v27.4s, v15.4s, v2.s[1]\n"
-                        "fmla v31.4s, v15.4s, v3.s[1]\n"
-                        "ldr q15, [%[b_ptr0], #0x30]\n"
-                        "fmla v16.4s, v8.4s, v0.s[2]\n"
-                        "fmla v20.4s, v8.4s, v1.s[2]\n"
-                        "fmla v24.4s, v8.4s, v2.s[2]\n"
-                        "fmla v28.4s, v8.4s, v3.s[2]\n"
-                        "fmla v17.4s, v9.4s, v0.s[2]\n"
-                        "fmla v21.4s, v9.4s, v1.s[2]\n"
-                        "fmla v25.4s, v9.4s, v2.s[2]\n"
-                        "fmla v29.4s, v9.4s, v3.s[2]\n"
-                        "fmla v18.4s, v10.4s, v0.s[2]\n"
-                        "fmla v22.4s, v10.4s, v1.s[2]\n"
-                        "fmla v26.4s, v10.4s, v2.s[2]\n"
-                        "fmla v30.4s, v10.4s, v3.s[2]\n"
-                        "fmla v19.4s, v11.4s, v0.s[2]\n"
-                        "fmla v23.4s, v11.4s, v1.s[2]\n"
-                        "fmla v27.4s, v11.4s, v2.s[2]\n"
-                        "fmla v31.4s, v11.4s, v3.s[2]\n"
-                        "fmla v16.4s, v12.4s, v0.s[3]\n"
-                        "fmla v20.4s, v12.4s, v1.s[3]\n"
-                        "fmla v24.4s, v12.4s, v2.s[3]\n"
-                        "fmla v28.4s, v12.4s, v3.s[3]\n"
-                        "fmla v17.4s, v13.4s, v0.s[3]\n"
-                        "fmla v21.4s, v13.4s, v1.s[3]\n"
-                        "fmla v25.4s, v13.4s, v2.s[3]\n"
-                        "fmla v29.4s, v13.4s, v3.s[3]\n"
-                        "fmla v18.4s, v14.4s, v0.s[3]\n"
-                        "fmla v22.4s, v14.4s, v1.s[3]\n"
-                        "fmla v26.4s, v14.4s, v2.s[3]\n"
-                        "fmla v30.4s, v14.4s, v3.s[3]\n"
-                        "fmla v19.4s, v15.4s, v0.s[3]\n"
-                        "fmla v23.4s, v15.4s, v1.s[3]\n"
-                        "fmla v27.4s, v15.4s, v2.s[3]\n"
-                        "fmla v31.4s, v15.4s, v3.s[3]\n"
-                        "4:\n"
-                        "cbz %[blocks], 5f\n"
-                        "6:\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "ldr s0, [%[a_ptr0]]\n"
-                        "ldr q8, [%[b_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x4\n"
-                        "ldr q9, [%[b_ptr0], #0x10]\n"
-                        "ldr s1, [a_ptr1]\n"
-                        "fmla v16.4s, v8.4s, v0.s[0]\n"
-                        "ldr q10, [%[b_ptr0], #0x20]\n"
-                        "ldr q11, [%[b_ptr0], #0x30]\n"
-                        "add a_ptr1, a_ptr1, #0x4\n"
-                        "fmla v20.4s, v8.4s, v1.s[0]\n"
-                        "ldr s2, [a_ptr2]\n"
-                        "fmla v17.4s, v9.4s, v0.s[0]\n"
-                        "add a_ptr2, a_ptr2, #0x4\n"
-                        "fmla v21.4s, v9.4s, v1.s[0]\n"
-                        "ldr s3, [a_ptr3]\n"
-                        "fmla v24.4s, v8.4s, v2.s[0]\n"
-                        "add a_ptr3, a_ptr3, #0x4\n"
-                        "fmla v25.4s, v9.4s, v2.s[0]\n"
-                        "fmla v28.4s, v8.4s, v3.s[0]\n"
-                        "fmla v29.4s, v9.4s, v3.s[0]\n"
-                        "fmla v18.4s, v10.4s, v0.s[0]\n"
-                        "fmla v22.4s, v10.4s, v1.s[0]\n"
-                        "fmla v26.4s, v10.4s, v2.s[0]\n"
-                        "fmla v30.4s, v10.4s, v3.s[0]\n"
-                        "fmla v19.4s, v11.4s, v0.s[0]\n"
-                        "fmla v23.4s, v11.4s, v1.s[0]\n"
-                        "fmla v27.4s, v11.4s, v2.s[0]\n"
-                        "fmla v31.4s, v11.4s, v3.s[0]\n"
-                        "b.ne 6b\n"
-                        "5:\n"
-                        "ld1r {v14.4s}, [%[minptr]]\n"
-                        "ld1r {v15.4s}, [%[maxptr]]\n"
-                        "fmax v16.4s, v16.4s, v14.4s\n"
-                        "fmax v17.4s, v17.4s, v14.4s\n"
-                        "fmax v18.4s, v18.4s, v14.4s\n"
-                        "fmax v19.4s, v19.4s, v14.4s\n"
-                        "fmin v16.4s, v16.4s, v15.4s\n"
-                        "fmin v17.4s, v17.4s, v15.4s\n"
-                        "fmin v18.4s, v18.4s, v15.4s\n"
-                        "fmin v19.4s, v19.4s, v15.4s\n"
-                        "str q16, [%[c_ptr0]]\n"
-                        "fmax v20.4s, v20.4s, v14.4s\n"
-                        "fmax v21.4s, v21.4s, v14.4s\n"
-                        "fmax v22.4s, v22.4s, v14.4s\n"
-                        "str q17, [%[c_ptr0], #0x10]\n"
-                        "fmax v23.4s, v23.4s, v14.4s\n"
-                        "fmin v20.4s, v20.4s, v15.4s\n"
-                        "fmin v21.4s, v21.4s, v15.4s\n"
-                        "str q18, [%[c_ptr0], #0x20]\n"
-                        "fmin v22.4s, v22.4s, v15.4s\n"
-                        "fmin v23.4s, v23.4s, v15.4s\n"
-                        "fmax v24.4s, v24.4s, v14.4s\n"
-                        "str q19, [%[c_ptr0], #0x30]\n"
-                        "fmax v25.4s, v25.4s, v14.4s\n"
-                        "add %[c_ptr0], %[c_ptr0], #0x40\n"
-                        "fmax v26.4s, v26.4s, v14.4s\n"
-                        "str q20, [c_ptr1]\n"
-                        "fmin v24.4s, v24.4s, v15.4s\n"
-                        "fmin v25.4s, v25.4s, v15.4s\n"
-                        "fmax v27.4s, v27.4s, v14.4s\n"
-                        "str q21, [c_ptr1, #0x10]\n"
-                        "fmin v26.4s, v26.4s, v15.4s\n"
-                        "fmax v28.4s, v28.4s, v14.4s\n"
-                        "fmax v29.4s, v29.4s, v14.4s\n"
-                        "str q22, [c_ptr1, #0x20]\n"
-                        "fmin v27.4s, v27.4s, v15.4s\n"
-                        "fmax v30.4s, v30.4s, v14.4s\n"
-                        "fmin v28.4s, v28.4s, v15.4s\n"
-                        "str q23, [c_ptr1, #0x30]\n"
-                        "fmin v29.4s, v29.4s, v15.4s\n"
-                        "fmax v31.4s, v31.4s, v14.4s\n"
-                        "fmin v30.4s, v30.4s, v15.4s\n"
-                        "str q24, [c_ptr2]\n"
-                        "fmin v31.4s, v31.4s, v15.4s\n"
-                        "str q25, [c_ptr2, #0x10]\n"
-                        "str q26, [c_ptr2, #0x20]\n"
-                        "str q27, [c_ptr2, #0x30]\n"
-                        "str q28, [c_ptr3]\n"
-                        "str q29, [c_ptr3, #0x10]\n"
-                        "str q30, [c_ptr3, #0x20]\n"
-                        "str q31, [c_ptr3, #0x30]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [ldb] "r" (ldbb)
-                        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
-                    );
-                    break;
-            }
-
-        }
-    }
-}
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
index ddc97b4..981ce34 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,6 +34,7 @@
 void a64_sgemm_asimd_12x8_a53(const float *, const float *, float *, int, int, int);
 void a64_sgemm_asimd_12x8_a55(const float *, const float *, float *, int, int, int);
 void a64_sgemm_asimd_12x8_a55r1(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_12x8_x1(const float *, const float *, float *, int, int, int);
 
 // 12x8 SGEMM "strategy" class.
 //
@@ -66,6 +67,22 @@
     // Use the standard fixed size transforms.
     StdTransformsFixed<operand_type, result_type, 8, 12> transforms = {};
 
+    static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
+        switch (ci->get_cpu_model()) {
+            case CPUModel::A55r1:
+                return { 3.724, 1.416, 1.113 };
+
+            case CPUModel::A53:
+                return { 2.777, 0.987, 0.898 };
+
+            case CPUModel::A73:
+                return { 2.885, 1.429, 1.163 };
+
+            default:
+                return { 6.949, 4.149, 2.826 };
+        }
+    }
+
     kern_type kernel=a64_sgemm_asimd_12x8;
 
     sgemm_12x8(const CPUInfo *ci) {
@@ -83,6 +100,10 @@
                 kernel = a64_sgemm_asimd_12x8_a55r1;
                 break;
 
+            case CPUModel::X1:
+                kernel = a64_sgemm_asimd_12x8_x1;
+                break;
+
             default:
                 /* Generic kernel is initialized by default. */
                 break;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp
index 2400191..5532485 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp
index d9aaee1..e9f071f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp
index 114c807..8a6fbac 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp
index 9c7495e..48dc467 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp
new file mode 100644
index 0000000..63fdf4d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp
@@ -0,0 +1,354 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include <arm_neon.h>
+
+#include "../../asmlib.hpp"
+
+// Kernel implementation.
+//
+// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
+// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order.
+// Assume that "Cpanel" points to a chunk of C output blocks (each size
+// 12x8), the chunks being arranged in a row major fashion.
+//
+// Note that the intent of this is that either ablocks or bblocks will be 1
+// - this construction allows the output loop to proceed in either order.
+
+namespace arm_gemm {
+
+void a64_sgemm_asimd_12x8_x1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+    const float *a_ptr = Apanel;
+    float *c_ptr = Cpanel;
+
+    for (int yb=0; yb<ablocks; yb++) {
+        const float *a_ptr0 = a_ptr;
+        const float *b_ptr = Bpanel;
+
+        for (int xb=0; xb<bblocks; xb++) {
+            a_ptr = a_ptr0;
+            // Fix up for odd lengths - set a flag if K is odd, but make
+            // sure we round up the iteration count.
+            int oddk = (K & 1);
+            int k = ((K+1)/2) - 1;
+
+            register float32x4_t a0  asm("v0");
+            register float32x4_t a1  asm("v1");
+            register float32x4_t b0  asm("v2");
+            register float32x4_t b1  asm("v3");
+            register float32x4_t b2  asm("v4");
+
+            __asm __volatile (
+                // Initialize result registers, load initial operands, prime prefetches.
+                "movi	v8.4s, #0x0\n"
+                "ldr	%q[a0], [%[a_ptr]]\n"
+                "movi	v9.4s, #0x0\n"
+                "ldr	%q[b0], [%[b_ptr]]\n"
+                "movi	v10.4s, #0x0\n"
+                "ldr	%q[a1], [%[a_ptr], #16]\n"
+                "movi	v11.4s, #0x0\n"
+                "ldr	%q[b1], [%[b_ptr], #16]\n"
+                "movi	v12.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #64]")
+                "movi	v13.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #64]")
+                "movi	v14.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #128]")
+                "movi	v15.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #128]")
+                "movi	v16.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #192]")
+                "movi	v17.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #256]")
+                "movi	v18.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #192]")
+                "movi	v19.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #320]")
+                "movi	v20.4s, #0x0\n"
+                ASM_PREFETCH("[%[a_ptr], #256]")
+                "movi	v21.4s, #0x0\n"
+                ASM_PREFETCH("[%[b_ptr], #384]")
+                "movi	v22.4s, #0x0\n"
+                "movi	v23.4s, #0x0\n"
+                "movi	v24.4s, #0x0\n"
+                "movi	v25.4s, #0x0\n"
+                "movi	v26.4s, #0x0\n"
+                "movi	v27.4s, #0x0\n"
+                "movi	v28.4s, #0x0\n"
+                "movi	v29.4s, #0x0\n"
+                "movi	v30.4s, #0x0\n"
+                "movi	v31.4s, #0x0\n"
+
+                // Skip loop if we are doing zero iterations of it.
+                "cbz	%w[k], 4f\n"
+
+                // Loop proper
+                "1:\n"
+                "fmla 	v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "fmla  	v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "ldr	%q[b2], [%[b_ptr], #32]\n"
+                "fmla	v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "fmla	v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "fmla 	v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla	v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "fmla	v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "fmla	v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "ldr	%q[b0], [%[b_ptr], #48]\n"
+
+                "fmla	v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "fmla	v17.4s, %[b1].4s, %[a0].s[1]\n"
+                ASM_PREFETCH("[%[a_ptr], #320]")
+                "fmla	v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "fmla	v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "ldr	%q[b1], [%[b_ptr], #64]\n"
+
+                "fmla	v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0].s[1]\n"
+                ASM_PREFETCH("[%[b_ptr], #448]")
+                "fmla	v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "ldr	%q[a0], [%[a_ptr], #32]\n"
+                "fmla	v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "fmla	v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "fmla	v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "fmla	v31.4s, %[b2].4s, %[a1].s[3]\n"
+                "ldr	%q[a1], [%[a_ptr], #48]\n"
+
+                "fmla 	v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "fmla	v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "ldr	%q[b2], [%[b_ptr], #80]\n"
+                "fmla	v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "fmla	v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "fmla 	v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla   v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "fmla	v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "fmla	v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "ldr	%q[b0], [%[b_ptr], #96]\n"
+
+                "fmla	v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "fmla	v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "fmla	v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0].s[3]\n"
+                ASM_PREFETCH("[%[b_ptr], #512]")
+                "fmla	v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "subs	%w[k], %w[k], #1\n"
+                "fmla	v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "ldr	%q[b1], [%[b_ptr], #112]\n"
+
+                "fmla	v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "add	%[a_ptr], %[a_ptr], #64\n"
+                "fmla	v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "ldr	%q[a0], [%[a_ptr]]\n"
+                "fmla	v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "fmla	v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
+                "fmla	v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "fmla	v31.4s, %[b2].4s, %[a1].s[3]\n"
+                "ldr	%q[a1], [%[a_ptr], #16]\n"
+                "bne	1b\n"
+
+                // Target to use when K is 1 or 2 (i.e. zero iterations of main loop)
+                "4:\n"
+
+                // Branch to alternative tail for odd K
+                "cbnz	%w[oddk], 2f\n"
+
+                // Detached final iteration (even K)
+                "fmla 	v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "fmla   v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "ldr	%q[b2], [%[b_ptr], #32]\n"
+                "fmla	v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "fmla	v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "fmla 	v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "fmla   v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "fmla	v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "fmla	v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "ldr	%q[b0], [%[b_ptr], #48]\n"
+
+                "fmla	v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "fmla	v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "fmla	v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "fmla	v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "ldr	%q[b1], [%[b_ptr], #64]\n"
+
+                "fmla	v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "add	%[a_ptr], %[a_ptr], #64\n"
+                "fmla	v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "ldr	%q[a0], [%[a_ptr], #-32]\n"
+                "fmla	v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "fmla	v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "add	%[b_ptr], %[b_ptr], #96\n"
+                "fmla	v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "fmla	v31.4s, %[b2].4s, %[a1].s[3]\n"
+                "ldr	%q[a1], [%[a_ptr], #-16]\n"
+
+                "fmla 	v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "fmla	v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "ldr	%q[b2], [%[b_ptr], #-16]\n"
+                "fmla   v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "str	q8, [%[c_ptr], #0]\n"
+                "fmla	v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "str	q16, [%[c_ptr], #16]\n"
+                "fmla	v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "str	q24, [%[c_ptr], #32]\n"
+
+                "fmla	v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "str	q9, [%[c_ptr], #48]\n"
+                "fmla	v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "str	q17, [%[c_ptr], #64]\n"
+                "fmla	v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "str	q25, [%[c_ptr], #80]\n"
+                "fmla	v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "str	q10, [%[c_ptr], #96]\n"
+
+                "fmla	v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "str	q18, [%[c_ptr], #112]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "str	q26, [%[c_ptr], #128]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "str	q11, [%[c_ptr], #144]\n"
+
+                "fmla 	v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "str	q19, [%[c_ptr], #160]\n"
+                "fmla	v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "str	q27, [%[c_ptr], #176]\n"
+                "fmla	v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "str	q12, [%[c_ptr], #192]\n"
+
+                "fmla   v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "str	q20, [%[c_ptr], #208]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "str	q28, [%[c_ptr], #224]\n"
+                "fmla	v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "str	q13, [%[c_ptr], #240]\n"
+
+                "fmla	v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "str	q21, [%[c_ptr], #256]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "str	q29, [%[c_ptr], #272]\n"
+                "fmla	v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "str	q14, [%[c_ptr], #288]\n"
+
+                "fmla	v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "str	q22, [%[c_ptr], #304]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "str	q30, [%[c_ptr], #320]\n"
+                "fmla	v31.4s, %[b2].4s, %[a1].s[3]\n"
+                "str	q15, [%[c_ptr], #336]\n"
+
+                "b	3f\n"
+
+                // Detached final iteration (odd K)
+                "2:\n"
+                "fmla 	v8.4s , %[b0].4s, %[a0].s[0]\n"
+                "ldr	%q[b2], [%[b_ptr], #32]\n"
+                "fmla	v16.4s, %[b1].4s, %[a0].s[0]\n"
+                "fmla   v9.4s , %[b0].4s, %[a0].s[1]\n"
+                "str	q8, [%[c_ptr], #0]\n"
+                "fmla	v17.4s, %[b1].4s, %[a0].s[1]\n"
+                "str	q16, [%[c_ptr], #16]\n"
+                "fmla	v24.4s, %[b2].4s, %[a0].s[0]\n"
+                "add	%[b_ptr], %[b_ptr], #48\n"
+                "add	%[a_ptr], %[a_ptr], #32\n"
+                "str	q24, [%[c_ptr], #32]\n"
+                "fmla	v25.4s, %[b2].4s, %[a0].s[1]\n"
+                "str	q9, [%[c_ptr], #48]\n"
+
+                "fmla	v10.4s, %[b0].4s, %[a0].s[2]\n"
+                "str	q17, [%[c_ptr], #64]\n"
+                "fmla	v18.4s, %[b1].4s, %[a0].s[2]\n"
+                "str	q25, [%[c_ptr], #80]\n"
+                "fmla	v26.4s, %[b2].4s, %[a0].s[2]\n"
+                "str	q10, [%[c_ptr], #96]\n"
+
+                "fmla	v11.4s, %[b0].4s, %[a0].s[3]\n"
+                "str	q18, [%[c_ptr], #112]\n"
+                "fmla	v19.4s, %[b1].4s, %[a0].s[3]\n"
+                "str	q26, [%[c_ptr], #128]\n"
+                "fmla	v27.4s, %[b2].4s, %[a0].s[3]\n"
+                "str	q11, [%[c_ptr], #144]\n"
+
+                "fmla 	v12.4s, %[b0].4s, %[a1].s[0]\n"
+                "str	q19, [%[c_ptr], #160]\n"
+                "fmla	v20.4s, %[b1].4s, %[a1].s[0]\n"
+                "str	q27, [%[c_ptr], #176]\n"
+                "fmla	v28.4s, %[b2].4s, %[a1].s[0]\n"
+                "str	q12, [%[c_ptr], #192]\n"
+
+                "fmla   v13.4s, %[b0].4s, %[a1].s[1]\n"
+                "str	q20, [%[c_ptr], #208]\n"
+                "fmla	v21.4s, %[b1].4s, %[a1].s[1]\n"
+                "str	q28, [%[c_ptr], #224]\n"
+                "fmla	v29.4s, %[b2].4s, %[a1].s[1]\n"
+                "str	q13, [%[c_ptr], #240]\n"
+
+                "fmla	v14.4s, %[b0].4s, %[a1].s[2]\n"
+                "str	q21, [%[c_ptr], #256]\n"
+                "fmla	v22.4s, %[b1].4s, %[a1].s[2]\n"
+                "str	q29, [%[c_ptr], #272]\n"
+                "fmla	v30.4s, %[b2].4s, %[a1].s[2]\n"
+                "str	q14, [%[c_ptr], #288]\n"
+
+                "fmla	v15.4s, %[b0].4s, %[a1].s[3]\n"
+                "str	q22, [%[c_ptr], #304]\n"
+                "fmla	v23.4s, %[b1].4s, %[a1].s[3]\n"
+                "str	q30, [%[c_ptr], #320]\n"
+                "fmla	v31.4s, %[b2].4s, %[a1].s[3]\n"
+                "str	q15, [%[c_ptr], #336]\n"
+
+                // Common tail
+                "3:\n"
+                "str	q23, [%[c_ptr], #352]\n"
+                "str	q31, [%[c_ptr], #368]\n"
+                "add	%[c_ptr], %[c_ptr], #384\n"
+            :
+              [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+              [a0] "+w" (a0), [a1] "+w" (a1),
+              [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k)
+            : [oddk] "r" (oddk)
+            : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+              "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc"
+            );
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp
index a7162c9..eb7136e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017,2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -69,7 +69,7 @@
 
     kern_type kernel = a64_sgemv_pretransposed;
 
-    sgemv_pretransposed(const CPUInfo *ci) { UNUSED(ci); }
+    sgemv_pretransposed(const CPUInfo *) { }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp
index 165e0a6..0640cec 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp
deleted file mode 100644
index cb7f239..0000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans/generic.cpp
+++ /dev/null
@@ -1,1072 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <cstddef>
-
-#include <arm_neon.h>
-
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-// Kernel implementation - transposed GEMV
-//
-// The kernel will process "M" rows of A (= steps of dot product) and "N"
-// columns (= dot products total)
-//
-// General plan is to do as many columns simultaneously as possible - a
-// reasonable limit is half the NEON regfile = 64 total accumulators.
-//
-// It's possible that messing around with sub-blocking M and N can yield
-// higher performance, but that's left to the outer loop.  In this kernel we
-// process all of M at the same time.
-
-
-// How far ahead to prefetch for the first and subsequent prefetches.
-// These values work for A72 on JunoR2...
-
-#define FIRST_PFD 9
-#define PFD 6
-
-namespace arm_gemm {
-
-void a64_sgemv_trans(const float *Astart, const float *Xstart, float *Ystart, float beta, int lda, int M, int N) {
-    const float *a_ptr_base = Astart;
-    float *y_ptr = Ystart;
-    const bool beta0 = (beta == 0.0f);
-
-    register const float32x4_t vb asm("v1") = vdupq_n_f32(beta);
-
-    int firstpfd=FIRST_PFD;
-    if (firstpfd > M) {
-        firstpfd = (M-1);
-    }
-
-    int pfd = PFD;
-    if (pfd > M) {
-        pfd = (M-1);
-    }
-
-    ptrdiff_t jump = lda * sizeof(int);
-
-    for (;N>=96;N-=96) {
-        int k = M-1;
-
-        const float *a_ptr = a_ptr_base;
-        const float *x_ptr = Xstart;
-        const float *pf_ptr = a_ptr;
-        const float *firstpf_ptr = a_ptr;
-        const float *pf_limit = a_ptr + (M * lda);
-
-        for (int i=0; i<firstpfd; i++) {
-            prefetch_1x(firstpf_ptr);
-            firstpf_ptr += lda;
-        }
-
-        for (int i=0; i<pfd; i++) {
-            prefetch_5x(pf_ptr + 16);
-            pf_ptr += lda;
-        }
-
-        a_ptr_base += 96;
-
-        __asm __volatile (
-            "movi	v8.4s,#0x0\n"
-            "ldr	w0, [%[x_ptr]]\n"
-            "movi	v9.4s,#0x0\n"
-            "ldr	q2,  [%[a_ptr], #0]\n"
-            "movi	v10.4s,#0x0\n"
-            "ldr	q3,  [%[a_ptr], #0x10]\n"
-            "movi	v11.4s,#0x0\n"
-            "ldr	q4, [%[a_ptr], #0x20]\n"
-            "movi	v12.4s,#0x0\n"
-            "ldr	q5, [%[a_ptr], #0x30]\n"
-            "movi	v13.4s,#0x0\n"
-            "ldr	q6, [%[a_ptr], #0x40]\n"
-            "movi	v14.4s,#0x0\n"
-            "ldr	q7, [%[a_ptr], #0x50]\n"
-            "movi	v15.4s,#0x0\n"
-            ASM_PREFETCH("[%[firstpf_ptr]]")
-            "movi	v16.4s, #0x0\n"
-            "movi	v17.4s, #0x0\n"
-            ASM_PREFETCH("[%[pf_ptr], #64]")
-            "movi	v18.4s, #0x0\n"
-            "movi	v19.4s, #0x0\n"
-            ASM_PREFETCH("[%[pf_ptr], #128]")
-            "movi	v20.4s, #0x0\n"
-            "movi	v21.4s, #0x0\n"
-            ASM_PREFETCH("[%[pf_ptr], #192]")
-            "movi	v22.4s, #0x0\n"
-            "movi	v23.4s, #0x0\n"
-            ASM_PREFETCH("[%[pf_ptr], #256]")
-            "movi	v24.4s, #0x0\n"
-            "movi	v25.4s, #0x0\n"
-            ASM_PREFETCH("[%[pf_ptr], #320]")
-            "movi	v26.4s, #0x0\n"
-            "movi	v27.4s, #0x0\n"
-            "add	%[pf_ptr], %[pf_ptr], %[jump]\n"
-            "movi	v28.4s, #0x0\n"
-            "add	%[firstpf_ptr], %[firstpf_ptr], %[jump]\n"
-            "movi	v29.4s, #0x0\n"
-            "movi	v30.4s, #0x0\n"
-            "movi	v31.4s, #0x0\n"
-
-            // Skip everything if there are no iterations of the main loop to do.
-            "cbz	%w[k], 10f\n"
-
-            // Loop with all prefetches.  Exit this loop when firstpf_ptr
-            // hits pf_limit.
-            "1:\n"
-            "dup	v0.4s, w0\n"
-            "ldr	w0, [%[x_ptr], #4]\n"
-            "add	%[x_ptr], %[x_ptr], #0x4\n"
-            "fmla	v8.4s, v2.4s, v0.4s\n"
-            "ldr	q2, [%[a_ptr], #0x60]\n"
-            "fmla	v9.4s, v3.4s, v0.4s\n"
-            "ldr	q3, [%[a_ptr], #0x70]\n"
-            ASM_PREFETCH("[%[firstpf_ptr]]")
-            "fmla	v10.4s, v4.4s, v0.4s\n"
-            "ldr	q4, [%[a_ptr], #0x80]\n"
-            "add	%[firstpf_ptr], %[firstpf_ptr], %[jump]\n"
-            "fmla	v11.4s, v5.4s, v0.4s\n"
-            "ldr	q5, [%[a_ptr], #0x90]\n"
-            "sub	%w[k], %w[k], #1\n"
-            ASM_PREFETCH("[%[x_ptr], #128]")
-            "fmla	v12.4s, v6.4s, v0.4s\n"
-            "ldr	q6, [%[a_ptr], #0xa0]\n"
-            "fmla	v13.4s, v7.4s, v0.4s\n"
-            "ldr	q7, [%[a_ptr], #0xb0]\n"
-            ASM_PREFETCH("[%[pf_ptr], #0x40]")
-            "fmla	v14.4s, v2.4s, v0.4s\n"
-            "ldr	q2, [%[a_ptr], #0xc0]\n"
-            "fmla	v15.4s, v3.4s, v0.4s\n"
-            "ldr	q3, [%[a_ptr], #0xd0]\n"
-            "fmla	v16.4s, v4.4s, v0.4s\n"
-            "ldr	q4, [%[a_ptr], #0xe0]\n"
-            "fmla	v17.4s, v5.4s, v0.4s\n"
-            "ldr	q5, [%[a_ptr], #0xf0]\n"
-            ASM_PREFETCH("[%[pf_ptr], #0x80]")
-            "fmla	v18.4s, v6.4s, v0.4s\n"
-            "ldr	q6, [%[a_ptr], #0x100]\n"
-            "fmla	v19.4s, v7.4s, v0.4s\n"
-            "ldr	q7, [%[a_ptr], #0x110]\n"
-            "fmla	v20.4s, v2.4s, v0.4s\n"
-            "ldr	q2, [%[a_ptr], #0x120]\n"
-            "fmla	v21.4s, v3.4s, v0.4s\n"
-            "ldr	q3, [%[a_ptr], #0x130]\n"
-            ASM_PREFETCH("[%[pf_ptr], #0xc0]")
-            "fmla	v22.4s, v4.4s, v0.4s\n"
-            "ldr	q4, [%[a_ptr], #0x140]\n"
-            "fmla	v23.4s, v5.4s, v0.4s\n"
-            "ldr	q5, [%[a_ptr], #0x150]\n"
-            "fmla	v24.4s, v6.4s, v0.4s\n"
-            "ldr	q6, [%[a_ptr], #0x160]\n"
-            "fmla	v25.4s, v7.4s, v0.4s\n"
-            "ldr	q7, [%[a_ptr], #0x170]\n"
-            ASM_PREFETCH("[%[pf_ptr], #0x100]")
-            "add	%[a_ptr], %[a_ptr], %[jump]\n"
-            "fmla	v26.4s, v2.4s, v0.4s\n"
-            "ldr	q2, [%[a_ptr], #0x00]\n"
-            "fmla	v27.4s, v3.4s, v0.4s\n"
-            "ldr	q3, [%[a_ptr], #0x10]\n"
-            "fmla	v28.4s, v4.4s, v0.4s\n"
-            "ldr	q4, [%[a_ptr], #0x20]\n"
-            "fmla	v29.4s, v5.4s, v0.4s\n"
-            "ldr	q5, [%[a_ptr], #0x30]\n"
-            ASM_PREFETCH("[%[pf_ptr], #0x140]")
-            "fmla	v30.4s, v6.4s, v0.4s\n"
-            "add	%[pf_ptr], %[pf_ptr], %[jump]\n"
-            "ldr	q6, [%[a_ptr], #0x40]\n"
-            "fmla	v31.4s, v7.4s, v0.4s\n"
-            "cmp	%[firstpf_ptr], %[pf_limit]\n"
-            "ldr	q7, [%[a_ptr], #0x50]\n"
-            "blt	1b\n"
-
-            // Check that there are still "main" prefetches to do.
-            "cmp	%[pf_ptr], %[pf_limit]\n"
-            "bge	9f\n"
-
-            // Just the main prefetches, exit this loop when pf_ptr hits pf_limit.
-            "8:\n"
-            "dup	v0.4s, w0\n"
-            "ldr	w0, [%[x_ptr], #4]\n"
-            "add	%[x_ptr], %[x_ptr], #0x4\n"
-            "fmla	v8.4s, v2.4s, v0.4s\n"
-            "ldr	q2, [%[a_ptr], #0x60]\n"
-            "fmla	v9.4s, v3.4s, v0.4s\n"
-            "ldr	q3, [%[a_ptr], #0x70]\n"
-            "fmla	v10.4s, v4.4s, v0.4s\n"
-            "ldr	q4, [%[a_ptr], #0x80]\n"
-            "fmla	v11.4s, v5.4s, v0.4s\n"
-            "ldr	q5, [%[a_ptr], #0x90]\n"
-            "sub	%w[k], %w[k], #1\n"
-            ASM_PREFETCH("[%[x_ptr], #128]")
-            "fmla	v12.4s, v6.4s, v0.4s\n"
-            "ldr	q6, [%[a_ptr], #0xa0]\n"
-            "fmla	v13.4s, v7.4s, v0.4s\n"
-            "ldr	q7, [%[a_ptr], #0xb0]\n"
-            ASM_PREFETCH("[%[pf_ptr], #0x40]")
-            "fmla	v14.4s, v2.4s, v0.4s\n"
-            "ldr	q2, [%[a_ptr], #0xc0]\n"
-            "fmla	v15.4s, v3.4s, v0.4s\n"
-            "ldr	q3, [%[a_ptr], #0xd0]\n"
-            "fmla	v16.4s, v4.4s, v0.4s\n"
-            "ldr	q4, [%[a_ptr], #0xe0]\n"
-            "fmla	v17.4s, v5.4s, v0.4s\n"
-            "ldr	q5, [%[a_ptr], #0xf0]\n"
-            ASM_PREFETCH("[%[pf_ptr], #0x80]")
-            "fmla	v18.4s, v6.4s, v0.4s\n"
-            "ldr	q6, [%[a_ptr], #0x100]\n"
-            "fmla	v19.4s, v7.4s, v0.4s\n"
-            "ldr	q7, [%[a_ptr], #0x110]\n"
-            "fmla	v20.4s, v2.4s, v0.4s\n"
-            "ldr	q2, [%[a_ptr], #0x120]\n"
-            "fmla	v21.4s, v3.4s, v0.4s\n"
-            "ldr	q3, [%[a_ptr], #0x130]\n"
-            ASM_PREFETCH("[%[pf_ptr], #0xc0]")
-            "fmla	v22.4s, v4.4s, v0.4s\n"
-            "ldr	q4, [%[a_ptr], #0x140]\n"
-            "fmla	v23.4s, v5.4s, v0.4s\n"
-            "ldr	q5, [%[a_ptr], #0x150]\n"
-            "fmla	v24.4s, v6.4s, v0.4s\n"
-            "ldr	q6, [%[a_ptr], #0x160]\n"
-            "fmla	v25.4s, v7.4s, v0.4s\n"
-            "ldr	q7, [%[a_ptr], #0x170]\n"
-            ASM_PREFETCH("[%[pf_ptr], #0x100]")
-            "add	%[a_ptr], %[a_ptr], %[jump]\n"
-            "fmla	v26.4s, v2.4s, v0.4s\n"
-            "ldr	q2, [%[a_ptr], #0x00]\n"
-            "fmla	v27.4s, v3.4s, v0.4s\n"
-            "ldr	q3, [%[a_ptr], #0x10]\n"
-            "fmla	v28.4s, v4.4s, v0.4s\n"
-            "ldr	q4, [%[a_ptr], #0x20]\n"
-            "fmla	v29.4s, v5.4s, v0.4s\n"
-            "ldr	q5, [%[a_ptr], #0x30]\n"
-            ASM_PREFETCH("[%[pf_ptr], #0x140]")
-            "fmla	v30.4s, v6.4s, v0.4s\n"
-            "add	%[pf_ptr], %[pf_ptr], %[jump]\n"
-            "ldr	q6, [%[a_ptr], #0x40]\n"
-            "fmla	v31.4s, v7.4s, v0.4s\n"
-            "cmp	%[pf_ptr], %[pf_limit]\n"
-            "ldr	q7, [%[a_ptr], #0x50]\n"
-            "blt	8b\n"
-
-            // Check that there is still work to do.
-            "9:\n"
-            "cmp	%w[k], #0\n"
-            "beq	10f\n"
-
-            // Loop without prefetches, exit when k hits 0.
-            "2:\n"
-            "dup	v0.4s, w0\n"
-            "ldr	w0, [%[x_ptr], #4]\n"
-            "add	%[x_ptr], %[x_ptr], #0x4\n"
-            "fmla	v8.4s, v2.4s, v0.4s\n"
-            "ldr	q2, [%[a_ptr], #0x60]\n"
-            "fmla	v9.4s, v3.4s, v0.4s\n"
-            "ldr	q3, [%[a_ptr], #0x70]\n"
-            "fmla	v10.4s, v4.4s, v0.4s\n"
-            "ldr	q4, [%[a_ptr], #0x80]\n"
-            "fmla	v11.4s, v5.4s, v0.4s\n"
-            "ldr	q5, [%[a_ptr], #0x90]\n"
-            "subs	%w[k], %w[k], #1\n"
-            "fmla	v12.4s, v6.4s, v0.4s\n"
-            "ldr	q6, [%[a_ptr], #0xa0]\n"
-            "fmla	v13.4s, v7.4s, v0.4s\n"
-            "ldr	q7, [%[a_ptr], #0xb0]\n"
-            "fmla	v14.4s, v2.4s, v0.4s\n"
-            "ldr	q2, [%[a_ptr], #0xc0]\n"
-            "fmla	v15.4s, v3.4s, v0.4s\n"
-            "ldr	q3, [%[a_ptr], #0xd0]\n"
-            "fmla	v16.4s, v4.4s, v0.4s\n"
-            "ldr	q4, [%[a_ptr], #0xe0]\n"
-            "fmla	v17.4s, v5.4s, v0.4s\n"
-            "ldr	q5, [%[a_ptr], #0xf0]\n"
-            "fmla	v18.4s, v6.4s, v0.4s\n"
-            "ldr	q6, [%[a_ptr], #0x100]\n"
-            "fmla	v19.4s, v7.4s, v0.4s\n"
-            "ldr	q7, [%[a_ptr], #0x110]\n"
-            "fmla	v20.4s, v2.4s, v0.4s\n"
-            "ldr	q2, [%[a_ptr], #0x120]\n"
-            "fmla	v21.4s, v3.4s, v0.4s\n"
-            "ldr	q3, [%[a_ptr], #0x130]\n"
-            "fmla	v22.4s, v4.4s, v0.4s\n"
-            "ldr	q4, [%[a_ptr], #0x140]\n"
-            "fmla	v23.4s, v5.4s, v0.4s\n"
-            "ldr	q5, [%[a_ptr], #0x150]\n"
-            "fmla	v24.4s, v6.4s, v0.4s\n"
-            "ldr	q6, [%[a_ptr], #0x160]\n"
-            "fmla	v25.4s, v7.4s, v0.4s\n"
-            "ldr	q7, [%[a_ptr], #0x170]\n"
-            "add	%[a_ptr], %[a_ptr], %[jump]\n"
-            "fmla	v26.4s, v2.4s, v0.4s\n"
-            "ldr	q2, [%[a_ptr], #0x00]\n"
-            "fmla	v27.4s, v3.4s, v0.4s\n"
-            "ldr	q3, [%[a_ptr], #0x10]\n"
-            "fmla	v28.4s, v4.4s, v0.4s\n"
-            "ldr	q4, [%[a_ptr], #0x20]\n"
-            "fmla	v29.4s, v5.4s, v0.4s\n"
-            "ldr	q5, [%[a_ptr], #0x30]\n"
-            "fmla	v30.4s, v6.4s, v0.4s\n"
-            "ldr	q6, [%[a_ptr], #0x40]\n"
-            "fmla	v31.4s, v7.4s, v0.4s\n"
-            "ldr	q7, [%[a_ptr], #0x50]\n"
-            "bne	2b\n"
-
-            "10:\n"
-
-            // Final iteration
-            "dup	v0.4s, w0\n"
-            "fmla	v8.4s, v2.4s, v0.4s\n"
-            "ldr	q2, [%[a_ptr], #0x60]\n"
-            "fmla	v9.4s, v3.4s, v0.4s\n"
-            "ldr	q3, [%[a_ptr], #0x70]\n"
-            "fmla	v10.4s, v4.4s, v0.4s\n"
-            "ldr	q4, [%[a_ptr], #0x80]\n"
-            "fmla	v11.4s, v5.4s, v0.4s\n"
-            "ldr	q5, [%[a_ptr], #0x90]\n"
-            "fmla	v12.4s, v6.4s, v0.4s\n"
-            "ldr	q6, [%[a_ptr], #0xa0]\n"
-            "fmla	v13.4s, v7.4s, v0.4s\n"
-            "ldr	q7, [%[a_ptr], #0xb0]\n"
-            "fmla	v14.4s, v2.4s, v0.4s\n"
-            "ldr	q2, [%[a_ptr], #0xc0]\n"
-            "fmla	v15.4s, v3.4s, v0.4s\n"
-            "ldr	q3, [%[a_ptr], #0xd0]\n"
-            "fmla	v16.4s, v4.4s, v0.4s\n"
-            "ldr	q4, [%[a_ptr], #0xe0]\n"
-            "fmla	v17.4s, v5.4s, v0.4s\n"
-            "ldr	q5, [%[a_ptr], #0xf0]\n"
-            "fmla	v18.4s, v6.4s, v0.4s\n"
-
-            "ldr	q6, [%[a_ptr], #0x100]\n"
-            "fmla	v19.4s, v7.4s, v0.4s\n"
-            "ldr	q7, [%[a_ptr], #0x110]\n"
-            "fmla	v20.4s, v2.4s, v0.4s\n"
-            "ldr	q2, [%[a_ptr], #0x120]\n"
-            "fmla	v21.4s, v3.4s, v0.4s\n"
-            "ldr	q3, [%[a_ptr], #0x130]\n"
-            "fmla	v22.4s, v4.4s, v0.4s\n"
-            "ldr	q4, [%[a_ptr], #0x140]\n"
-            "fmla	v23.4s, v5.4s, v0.4s\n"
-            "ldr	q5, [%[a_ptr], #0x150]\n"
-            "fmla	v24.4s, v6.4s, v0.4s\n"
-            "ldr	q6, [%[a_ptr], #0x160]\n"
-            "fmla	v25.4s, v7.4s, v0.4s\n"
-            "ldr	q7, [%[a_ptr], #0x170]\n"
-            "fmla	v26.4s, v2.4s, v0.4s\n"
-            "cbnz	%w[beta0], 11f\n"
-            "ldr	q2,  [%[y_ptr]]\n"
-            "fmla	v27.4s, v3.4s, v0.4s\n"
-            "ldr	q3,  [%[y_ptr], #0x10]\n"
-            "fmla	v28.4s, v4.4s, v0.4s\n"
-            "ldr	q4,  [%[y_ptr], #0x20]\n"
-            "fmla	v29.4s, v5.4s, v0.4s\n"
-            "ldr	q5,  [%[y_ptr], #0x30]\n"
-            "fmla	v30.4s, v6.4s, v0.4s\n"
-            "ldr	q6,  [%[y_ptr], #0x40]\n"
-            "fmla	v31.4s, v7.4s, v0.4s\n"
-            "ldr	q7,  [%[y_ptr], #0x50]\n"
-
-            "fmla	v8.4s, v2.4s, %[vb].4s\n"
-            "ldr	q2, [%[y_ptr], #0x60]\n"
-            "fmla	v9.4s, v3.4s, %[vb].4s\n"
-            "ldr	q3, [%[y_ptr], #0x70]\n"
-            "fmla	v10.4s, v4.4s, %[vb].4s\n"
-            "ldr	q4, [%[y_ptr], #0x80]\n"
-            "fmla	v11.4s, v5.4s, %[vb].4s\n"
-            "ldr	q5, [%[y_ptr], #0x90]\n"
-            "fmla	v12.4s, v6.4s, %[vb].4s\n"
-            "ldr	q6, [%[y_ptr], #0xa0]\n"
-            "str	q8, [%[y_ptr], #0x00]\n"
-            "fmla	v13.4s, v7.4s, %[vb].4s\n"
-            "ldr	q7, [%[y_ptr], #0xb0]\n"
-            "str	q9, [%[y_ptr], #0x10]\n"
-            "fmla	v14.4s, v2.4s, %[vb].4s\n"
-            "ldr	q2, [%[y_ptr], #0xc0]\n"
-            "str	q10, [%[y_ptr], #0x20]\n"
-            "fmla	v15.4s, v3.4s, %[vb].4s\n"
-            "ldr	q3, [%[y_ptr], #0xd0]\n"
-            "str	q11, [%[y_ptr], #0x30]\n"
-            "fmla	v16.4s, v4.4s, %[vb].4s\n"
-            "ldr	q4, [%[y_ptr], #0xe0]\n"
-            "str	q12, [%[y_ptr], #0x40]\n"
-            "fmla	v17.4s, v5.4s, %[vb].4s\n"
-            "ldr	q5, [%[y_ptr], #0xf0]\n"
-            "str	q13, [%[y_ptr], #0x50]\n"
-            "fmla	v18.4s, v6.4s, %[vb].4s\n"
-            "ldr	q6, [%[y_ptr], #0x100]\n"
-            "str	q14, [%[y_ptr], #0x60]\n"
-            "fmla	v19.4s, v7.4s, %[vb].4s\n"
-            "ldr	q7, [%[y_ptr], #0x110]\n"
-            "str	q15, [%[y_ptr], #0x70]\n"
-            "fmla	v20.4s, v2.4s, %[vb].4s\n"
-            "ldr	q2, [%[y_ptr], #0x120]\n"
-            "str	q16, [%[y_ptr], #0x80]\n"
-            "fmla	v21.4s, v3.4s, %[vb].4s\n"
-            "ldr	q3, [%[y_ptr], #0x130]\n"
-            "str	q17, [%[y_ptr], #0x90]\n"
-            "fmla	v22.4s, v4.4s, %[vb].4s\n"
-            "ldr	q4, [%[y_ptr], #0x140]\n"
-            "str	q18, [%[y_ptr], #0xa0]\n"
-            "fmla	v23.4s, v5.4s, %[vb].4s\n"
-            "ldr	q5, [%[y_ptr], #0x150]\n"
-            "str	q19, [%[y_ptr], #0xb0]\n"
-            "fmla	v24.4s, v6.4s, %[vb].4s\n"
-            "ldr	q6, [%[y_ptr], #0x160]\n"
-            "str	q20, [%[y_ptr], #0xc0]\n"
-            "fmla	v25.4s, v7.4s, %[vb].4s\n"
-            "ldr	q7, [%[y_ptr], #0x170]\n"
-            "str	q21, [%[y_ptr], #0xd0]\n"
-            "fmla	v26.4s, v2.4s, %[vb].4s\n"
-            "str	q22, [%[y_ptr], #0xe0]\n"
-            "fmla	v27.4s, v3.4s, %[vb].4s\n"
-            "str	q23, [%[y_ptr], #0xf0]\n"
-            "fmla	v28.4s, v4.4s, %[vb].4s\n"
-            "str	q24, [%[y_ptr], #0x100]\n"
-            "fmla	v29.4s, v5.4s, %[vb].4s\n"
-            "str	q25, [%[y_ptr], #0x110]\n"
-            "fmla	v30.4s, v6.4s, %[vb].4s\n"
-            "str	q26, [%[y_ptr], #0x120]\n"
-            "fmla	v31.4s, v7.4s, %[vb].4s\n"
-            "str	q27, [%[y_ptr], #0x130]\n"
-            "b		12f\n"
-
-            // beta 0 code - don't read.
-            "11:\n"
-            "str	q8, [%[y_ptr], #0x00]\n"
-            "fmla	v27.4s, v3.4s, v0.4s\n"
-            "str	q9, [%[y_ptr], #0x10]\n"
-            "fmla	v28.4s, v4.4s, v0.4s\n"
-            "str	q10, [%[y_ptr], #0x20]\n"
-            "fmla	v29.4s, v5.4s, v0.4s\n"
-            "str	q11, [%[y_ptr], #0x30]\n"
-            "fmla	v30.4s, v6.4s, v0.4s\n"
-            "str	q12, [%[y_ptr], #0x40]\n"
-            "fmla	v31.4s, v7.4s, v0.4s\n"
-
-            "str	q13, [%[y_ptr], #0x50]\n"
-            "str	q14, [%[y_ptr], #0x60]\n"
-            "str	q15, [%[y_ptr], #0x70]\n"
-            "str	q16, [%[y_ptr], #0x80]\n"
-            "str	q17, [%[y_ptr], #0x90]\n"
-            "str	q18, [%[y_ptr], #0xa0]\n"
-            "str	q19, [%[y_ptr], #0xb0]\n"
-            "str	q20, [%[y_ptr], #0xc0]\n"
-            "str	q21, [%[y_ptr], #0xd0]\n"
-            "str	q22, [%[y_ptr], #0xe0]\n"
-            "str	q23, [%[y_ptr], #0xf0]\n"
-            "str	q24, [%[y_ptr], #0x100]\n"
-            "str	q25, [%[y_ptr], #0x110]\n"
-            "str	q26, [%[y_ptr], #0x120]\n"
-            "str	q27, [%[y_ptr], #0x130]\n"
-
-            "12:\n"
-            "stp	q28, q29, [%[y_ptr], #0x140]\n"
-            "stp	q30, q31, [%[y_ptr], #0x160]\n"
-            "add	%[y_ptr], %[y_ptr], #0x180\n"
-
-
-
-          : [a_ptr] "+r" (a_ptr), [x_ptr] "+r" (x_ptr), [y_ptr] "+r" (y_ptr), [k] "+r" (k), [pf_ptr] "+r" (pf_ptr), [firstpf_ptr] "+r" (firstpf_ptr)
-          : [jump] "r" (jump), [vb] "w" (vb), [pf_limit] "r" (pf_limit), [beta0] "r" (beta0)
-          : "w0", "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8",  "v9", "v10", "v11", "v12", "v13",
-            "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
-            "v27", "v28", "v29", "v30", "v31", "cc"
-        );
-    }
-
-    if (N>0) {
-        // Handle N tail - up to 95 stragglers.
-        // This is 0-23 vectors, plus optionally an 64-bit vector and/or a
-        // single value for the remainder.
-
-        // Independent pointers into the matrix for the odd 2 and odd 1.
-        // Double up as flag to indicate whether they are needed.
-        const float *odd2_aptr=NULL;
-        const float *odd1_aptr=NULL;
-
-        // Figure out how much work we need to do.
-        int numvecs = N/4;
-        int rem = N%4;
-        int k=M;
-
-        // Set up pointers for the odd 2/1 if needed.
-        if (rem >= 2) {
-            odd2_aptr = a_ptr_base + (numvecs * 4);
-        }
-
-        if (rem & 1) {
-            odd1_aptr = a_ptr_base + (numvecs * 4) + (odd2_aptr==NULL ? 0 : 2);
-        }
-
-        const float *a_ptr = a_ptr_base;
-        const float *firstpf_ptr = a_ptr_base;
-        const float *pf_ptr = a_ptr_base;
-        const float *pf_limit = a_ptr + (M * lda);
-
-        const float *x_ptr = Xstart;
-        int vecs=0; // Working variable to count how many vectors to work on.
-        int dopf=1; // Track whether we are doing prefetches.
-
-        // Figure out how many cache lines we need to prefetch each time.
-        int numpfs = (N + 15) / 16;
-
-        // Do initial prefetches
-        for (int i=0; i<firstpfd+1; i++) {
-            prefetch_1x(firstpf_ptr);
-            firstpf_ptr += lda;
-        }
-
-        // Do "main" prefetches - adapt number to the number we actually need.
-        if (numpfs > 1) {
-            for (int i=0; i<pfd+1; i++) {
-                switch (numpfs) {
-                    case 2:
-                        prefetch_1x(pf_ptr + 16);
-                        break;
-
-                    case 3:
-                        prefetch_2x(pf_ptr + 16);
-                        break;
-
-                    case 4:
-                        prefetch_3x(pf_ptr + 16);
-                        break;
-
-                    case 5:
-                        prefetch_4x(pf_ptr + 16);
-                        break;
-
-                    case 6:
-                        prefetch_5x(pf_ptr + 16);
-                        break;
-
-                    default:
-                        UNREACHABLE("Impossible.");
-                }
-                pf_ptr += lda;
-            }
-        } else {
-            // Just disable additional prefetches
-            dopf=0;
-        }
-
-        // Do the real work
-        __asm __volatile (
-            // Initialize all the vectors - not worth skipping this if only
-            // some are needed.
-            "movi	v8.4s,#0x0\n"
-            "ldr	w0, [%[x_ptr]]\n"
-            "movi	v9.4s,#0x0\n"
-            "movi	v10.4s,#0x0\n"
-            "movi	v11.4s,#0x0\n"
-            "movi	v12.4s,#0x0\n"
-            "movi	v13.4s,#0x0\n"
-            "movi	v14.4s,#0x0\n"
-            "movi	v15.4s,#0x0\n"
-            "movi	v16.4s, #0x0\n"
-            "movi	v17.4s, #0x0\n"
-            "movi	v18.4s, #0x0\n"
-            "movi	v19.4s, #0x0\n"
-            "movi	v20.4s, #0x0\n"
-            "movi	v21.4s, #0x0\n"
-            "movi	v22.4s, #0x0\n"
-            "movi	v23.4s, #0x0\n"
-            "movi	v24.4s, #0x0\n"
-            "movi	v25.4s, #0x0\n"
-            "movi	v26.4s, #0x0\n"
-            "movi	v27.4s, #0x0\n"
-            "movi	v28.4s, #0x0\n"
-            "movi	v29.4s, #0x0\n"
-            "movi	v30.4s, #0x0\n"
-            "movi	v6.2s, #0x0\n"
-            "movi	v5.2s, #0x0\n"
-
-            "1:\n"
-            ASM_PREFETCH("[%[firstpf_ptr]]\n")
-            "11:\n"
-            "dup	v0.4s, w0\n"
-            "ldr	w0, [%[x_ptr], #4]\n"
-            "add	%[x_ptr], %[x_ptr], #4\n"
-
-            "cbz	%w[numvecs], 2f\n"
-            "mov	%w[vecs], %w[numvecs]\n"
-
-            // Vector 0
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7,[%[a_ptr], #0x00]\n"
-            "fmla	v8.4s, v7.4s, v0.4s\n"
-            "beq	2f\n"
-            // Vector 1
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7,[%[a_ptr], #0x10]\n"
-            "fmla	v9.4s, v7.4s, v0.4s\n"
-            "beq	2f\n"
-            // Vector 2
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7,[%[a_ptr], #0x20]\n"
-            "fmla	v10.4s, v7.4s, v0.4s\n"
-            "beq	2f\n"
-            // Vector 3
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7,[%[a_ptr], #0x30]\n"
-            "fmla	v11.4s, v7.4s, v0.4s\n"
-            // Prefetch
-            "cbz	%w[dopf], 3f\n"
-            ASM_PREFETCH("[%[pf_ptr], #0x40]")
-            "3:\n"
-            "beq	2f\n"
-
-            // Vector 4
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7,[%[a_ptr], #0x40]\n"
-            "fmla	v12.4s, v7.4s, v0.4s\n"
-            "beq	2f\n"
-            // Vector 5
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7,[%[a_ptr], #0x50]\n"
-            "fmla	v13.4s, v7.4s, v0.4s\n"
-            "beq	2f\n"
-            // Vector 6
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7,[%[a_ptr], #0x60]\n"
-            "fmla	v14.4s, v7.4s, v0.4s\n"
-            "beq	2f\n"
-            // Vector 7
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7,[%[a_ptr], #0x70]\n"
-            "fmla	v15.4s, v7.4s, v0.4s\n"
-            // Prefetch
-            "cbz	%w[dopf], 4f\n"
-            ASM_PREFETCH("[%[pf_ptr], #0x80]")
-            "4:\n"
-            "beq	2f\n"
-
-            // Vector 8
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7,[%[a_ptr], #0x80]\n"
-            "fmla	v16.4s, v7.4s, v0.4s\n"
-            "beq	2f\n"
-            // Vector 9
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7,[%[a_ptr], #0x90]\n"
-            "fmla	v17.4s, v7.4s, v0.4s\n"
-            "beq	2f\n"
-            // Vector 10
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7,[%[a_ptr], #0xa0]\n"
-            "fmla	v18.4s, v7.4s, v0.4s\n"
-            "beq	2f\n"
-            // Vector 11
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7,[%[a_ptr], #0xb0]\n"
-            "fmla	v19.4s, v7.4s, v0.4s\n"
-            // Prefetch
-            "cbz	%w[dopf], 5f\n"
-            ASM_PREFETCH("[%[pf_ptr], #0xc0]")
-            "5:\n"
-            "beq	2f\n"
-
-            // Vector 12
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7,[%[a_ptr], #0xc0]\n"
-            "fmla	v20.4s, v7.4s, v0.4s\n"
-            "beq	2f\n"
-            // Vector 13
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7,[%[a_ptr], #0xd0]\n"
-            "fmla	v21.4s, v7.4s, v0.4s\n"
-            "beq	2f\n"
-            // Vector 14
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7,[%[a_ptr], #0xe0]\n"
-            "fmla	v22.4s, v7.4s, v0.4s\n"
-            "beq	2f\n"
-            // Vector 15
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7,[%[a_ptr], #0xf0]\n"
-            "fmla	v23.4s, v7.4s, v0.4s\n"
-            // Prefetch
-            "cbz	%w[dopf], 6f\n"
-            ASM_PREFETCH("[%[pf_ptr], #0x100]")
-            "6:\n"
-            "beq	2f\n"
-
-            // Vector 16
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7,[%[a_ptr], #0x100]\n"
-            "fmla	v24.4s, v7.4s, v0.4s\n"
-            "beq	2f\n"
-            // Vector 17
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7,[%[a_ptr], #0x110]\n"
-            "fmla	v25.4s, v7.4s, v0.4s\n"
-            "beq	2f\n"
-            // Vector 18
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7,[%[a_ptr], #0x120]\n"
-            "fmla	v26.4s, v7.4s, v0.4s\n"
-            "beq	2f\n"
-            // Vector 19
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7,[%[a_ptr], #0x130]\n"
-            "fmla	v27.4s, v7.4s, v0.4s\n"
-            // Prefetch
-            "cbz	%w[dopf], 7f\n"
-            ASM_PREFETCH("[%[pf_ptr], #0x140]")
-            "7:\n"
-            "beq	2f\n"
-
-            // Vector 20
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7,[%[a_ptr], #0x140]\n"
-            "fmla	v28.4s, v7.4s, v0.4s\n"
-            "beq	2f\n"
-            // Vector 21
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7,[%[a_ptr], #0x150]\n"
-            "fmla	v29.4s, v7.4s, v0.4s\n"
-            "beq	2f\n"
-            // Vector 22
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7,[%[a_ptr], #0x160]\n"
-            "fmla	v30.4s, v7.4s, v0.4s\n"
-
-            "2:\n"
-            "add	%[a_ptr], %[a_ptr], %[jump]\n"
-
-            // Do the odd 2-vector, if needed
-            "cbz	%[odd2_aptr], 8f\n"
-            "ldr	d7, [%[odd2_aptr]]\n"
-            "fmla	v6.2s, v7.2s, v0.2s\n"
-            "add	%[odd2_aptr], %[odd2_aptr], %[jump]\n"
-
-            "8:\n"
-            // Do the odd 1-vector, if needed
-            "cbz	%[odd1_aptr], 9f\n"
-            "ldr	s7, [%[odd1_aptr]]\n"
-            "fmla	v5.2s, v7.2s, v0.2s\n"
-            "add	%[odd1_aptr], %[odd1_aptr], %[jump]\n"
-
-            // Get out if needed.
-            "9:\n"
-            "subs	%w[k], %w[k], #1\n"
-            "beq	10f\n"
-
-            // Update the "main" prefetch pointer, if it strays beyond the limit turn off "dopf"
-            "add	%[pf_ptr], %[pf_ptr], %[jump]\n"
-            "cmp	%[pf_ptr], %[pf_limit]\n"
-            "csel	%w[dopf], %w[dopf], WZR, LT\n"
-
-            // Update the "leading" prefetch pointer, don't do the first
-            // instruction of the loop if it's over the limit.
-            "add	%[firstpf_ptr], %[firstpf_ptr], %[jump]\n"
-            "cmp	%[firstpf_ptr], %[pf_limit]\n"
-            "blt	1b\n"
-            "b		11b\n"
-
-            // Now write out the outputs
-            "10:\n"
-            "cbnz	%w[beta0], 15f\n"
-
-            "cbz	%w[numvecs], 12f\n"
-            "mov	%w[vecs], %w[numvecs]\n"
-
-            // Vector 0
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7, [%[y_ptr]]\n"
-            "fmla	v8.4s, v7.4s, %[vb].4s\n"
-            "str	q8, [%[y_ptr]], #0x10\n"
-            "beq	12f\n"
-            // Vector 1
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7, [%[y_ptr]]\n"
-            "fmla	v9.4s, v7.4s, %[vb].4s\n"
-            "str	q9, [%[y_ptr]], #0x10\n"
-            "beq	12f\n"
-            // Vector 2
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7, [%[y_ptr]]\n"
-            "fmla	v10.4s, v7.4s, %[vb].4s\n"
-            "str	q10, [%[y_ptr]], #0x10\n"
-            "beq	12f\n"
-            // Vector 3
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7, [%[y_ptr]]\n"
-            "fmla	v11.4s, v7.4s, %[vb].4s\n"
-            "str	q11, [%[y_ptr]], #0x10\n"
-            "beq	12f\n"
-            // Vector 4
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7, [%[y_ptr]]\n"
-            "fmla	v12.4s, v7.4s, %[vb].4s\n"
-            "str	q12, [%[y_ptr]], #0x10\n"
-            "beq	12f\n"
-            // Vector 5
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7, [%[y_ptr]]\n"
-            "fmla	v13.4s, v7.4s, %[vb].4s\n"
-            "str	q13, [%[y_ptr]], #0x10\n"
-            "beq	12f\n"
-            // Vector 6
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7, [%[y_ptr]]\n"
-            "fmla	v14.4s, v7.4s, %[vb].4s\n"
-            "str	q14, [%[y_ptr]], #0x10\n"
-            "beq	12f\n"
-            // Vector 7
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7, [%[y_ptr]]\n"
-            "fmla	v15.4s, v7.4s, %[vb].4s\n"
-            "str	q15, [%[y_ptr]], #0x10\n"
-            "beq	12f\n"
-            // Vector 8
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7, [%[y_ptr]]\n"
-            "fmla	v16.4s, v7.4s, %[vb].4s\n"
-            "str	q16, [%[y_ptr]], #0x10\n"
-            "beq	12f\n"
-            // Vector 9
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7, [%[y_ptr]]\n"
-            "fmla	v17.4s, v7.4s, %[vb].4s\n"
-            "str	q17, [%[y_ptr]], #0x10\n"
-            "beq	12f\n"
-            // Vector 10
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7, [%[y_ptr]]\n"
-            "fmla	v18.4s, v7.4s, %[vb].4s\n"
-            "str	q18, [%[y_ptr]], #0x10\n"
-            "beq	12f\n"
-            // Vector 11
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7, [%[y_ptr]]\n"
-            "fmla	v19.4s, v7.4s, %[vb].4s\n"
-            "str	q19, [%[y_ptr]], #0x10\n"
-            "beq	12f\n"
-            // Vector 12
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7, [%[y_ptr]]\n"
-            "fmla	v20.4s, v7.4s, %[vb].4s\n"
-            "str	q20, [%[y_ptr]], #0x10\n"
-            "beq	12f\n"
-            // Vector 13
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7, [%[y_ptr]]\n"
-            "fmla	v21.4s, v7.4s, %[vb].4s\n"
-            "str	q21, [%[y_ptr]], #0x10\n"
-            "beq	12f\n"
-            // Vector 14
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7, [%[y_ptr]]\n"
-            "fmla	v22.4s, v7.4s, %[vb].4s\n"
-            "str	q22, [%[y_ptr]], #0x10\n"
-            "beq	12f\n"
-            // Vector 15
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7, [%[y_ptr]]\n"
-            "fmla	v23.4s, v7.4s, %[vb].4s\n"
-            "str	q23, [%[y_ptr]], #0x10\n"
-            "beq	12f\n"
-            // Vector 16
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7, [%[y_ptr]]\n"
-            "fmla	v24.4s, v7.4s, %[vb].4s\n"
-            "str	q24, [%[y_ptr]], #0x10\n"
-            "beq	12f\n"
-            // Vector 17
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7, [%[y_ptr]]\n"
-            "fmla	v25.4s, v7.4s, %[vb].4s\n"
-            "str	q25, [%[y_ptr]], #0x10\n"
-            "beq	12f\n"
-            // Vector 18
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7, [%[y_ptr]]\n"
-            "fmla	v26.4s, v7.4s, %[vb].4s\n"
-            "str	q26, [%[y_ptr]], #0x10\n"
-            "beq	12f\n"
-            // Vector 19
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7, [%[y_ptr]]\n"
-            "fmla	v27.4s, v7.4s, %[vb].4s\n"
-            "str	q27, [%[y_ptr]], #0x10\n"
-            "beq	12f\n"
-            // Vector 20
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7, [%[y_ptr]]\n"
-            "fmla	v28.4s, v7.4s, %[vb].4s\n"
-            "str	q28, [%[y_ptr]], #0x10\n"
-            "beq	12f\n"
-            // Vector 21
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7, [%[y_ptr]]\n"
-            "fmla	v29.4s, v7.4s, %[vb].4s\n"
-            "str	q29, [%[y_ptr]], #0x10\n"
-            "beq	12f\n"
-            // Vector 22
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "ldr	q7, [%[y_ptr]]\n"
-            "fmla	v30.4s, v7.4s, %[vb].4s\n"
-            "str	q30, [%[y_ptr]], #0x10\n"
-
-            // Odd 2
-            "12:\n"
-            "cbz	%[odd2_aptr], 13f\n"
-            "ldr	d7, [%[y_ptr]]\n"
-            "fmla	v6.2s, v7.2s, %[vb].2s\n"
-            "str	d6, [%[y_ptr]], #0x8\n"
-
-            // Odd 1
-            "13:\n"
-            "cbz	%[odd1_aptr], 14f\n"
-            "ldr	s7, [%[y_ptr]]\n"
-            "fmla	v5.2s, v7.2s, %[vb].2s\n"
-            "str	s5, [%[y_ptr]]\n"
-            "b		14f\n"
-
-            "15:\n"
-            // beta0 code
-            "cbz	%w[numvecs], 16f\n"
-            "mov	%w[vecs], %w[numvecs]\n"
-
-            // Vector 0
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "str	q8, [%[y_ptr]], #0x10\n"
-            "beq	16f\n"
-            // Vector 1
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "str	q9, [%[y_ptr]], #0x10\n"
-            "beq	16f\n"
-            // Vector 2
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "str	q10, [%[y_ptr]], #0x10\n"
-            "beq	16f\n"
-            // Vector 3
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "str	q11, [%[y_ptr]], #0x10\n"
-            "beq	16f\n"
-            // Vector 4
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "str	q12, [%[y_ptr]], #0x10\n"
-            "beq	16f\n"
-            // Vector 5
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "str	q13, [%[y_ptr]], #0x10\n"
-            "beq	16f\n"
-            // Vector 6
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "str	q14, [%[y_ptr]], #0x10\n"
-            "beq	16f\n"
-            // Vector 7
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "str	q15, [%[y_ptr]], #0x10\n"
-            "beq	16f\n"
-            // Vector 8
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "str	q16, [%[y_ptr]], #0x10\n"
-            "beq	16f\n"
-            // Vector 9
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "str	q17, [%[y_ptr]], #0x10\n"
-            "beq	16f\n"
-            // Vector 10
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "str	q18, [%[y_ptr]], #0x10\n"
-            "beq	16f\n"
-            // Vector 11
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "str	q19, [%[y_ptr]], #0x10\n"
-            "beq	16f\n"
-            // Vector 12
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "str	q20, [%[y_ptr]], #0x10\n"
-            "beq	16f\n"
-            // Vector 13
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "str	q21, [%[y_ptr]], #0x10\n"
-            "beq	16f\n"
-            // Vector 14
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "str	q22, [%[y_ptr]], #0x10\n"
-            "beq	16f\n"
-            // Vector 15
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "str	q23, [%[y_ptr]], #0x10\n"
-            "beq	16f\n"
-            // Vector 16
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "str	q24, [%[y_ptr]], #0x10\n"
-            "beq	16f\n"
-            // Vector 17
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "str	q25, [%[y_ptr]], #0x10\n"
-            "beq	16f\n"
-            // Vector 18
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "str	q26, [%[y_ptr]], #0x10\n"
-            "beq	16f\n"
-            // Vector 19
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "str	q27, [%[y_ptr]], #0x10\n"
-            "beq	16f\n"
-            // Vector 20
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "str	q28, [%[y_ptr]], #0x10\n"
-            "beq	16f\n"
-            // Vector 21
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "str	q29, [%[y_ptr]], #0x10\n"
-            "beq	16f\n"
-            // Vector 22
-            "subs	%w[vecs], %w[vecs], #1\n"
-            "str	q30, [%[y_ptr]], #0x10\n"
-
-            // Odd 2
-            "16:\n"
-            "cbz	%[odd2_aptr], 17f\n"
-            "str	d6, [%[y_ptr]], #0x8\n"
-
-            // Odd 1
-            "17:\n"
-            "cbz	%[odd1_aptr], 14f\n"
-            "str	s5, [%[y_ptr]]\n"
-
-            "14:\n"
-          : [a_ptr] "+r" (a_ptr), [x_ptr] "+r" (x_ptr), [y_ptr] "+r" (y_ptr), [k] "+r" (k),
-            [pf_ptr] "+r" (pf_ptr), [firstpf_ptr] "+r" (firstpf_ptr),
-            [odd1_aptr] "+r" (odd1_aptr), [odd2_aptr] "+r" (odd2_aptr),
-            [dopf] "+r" (dopf), [vecs] "+r" (vecs)
-          : [jump] "r" (jump), [vb] "w" (vb), [pf_limit] "r" (pf_limit), [numvecs] "r" (numvecs), [beta0] "r" (beta0)
-          : "w0", "v0", "v2", "v3", "v4", "v5", "v6", "v7", "v8",  "v9", "v10", "v11", "v12", "v13",
-            "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
-            "v27", "v28", "v29", "v30", "v31", "cc"
-        );
-    }
-}
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp
index 352a147..6f31efe 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -55,7 +55,7 @@
         return 1;
     }
 
-    static constexpr bool supports_append()
+    static constexpr bool supports_accumulate()
     {
         return false;
     }
@@ -75,7 +75,10 @@
     // Default to the generic kernel
     kern_type kernel=a64_smallK_hybrid_fp32_mla_4x6;
 
-    smallK_hybrid_fp32_mla_4x6(const CPUInfo *ci) { UNUSED(ci); }
+    smallK_hybrid_fp32_mla_4x6(const CPUInfo *)
+    {
+
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6/generic.cpp
index d8e8e52..e2fec6a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp
index c5d39cb..e9a0948 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,8 +25,6 @@
 
 #ifdef __aarch64__
 
-
-
 namespace arm_gemm
 {
 
@@ -57,7 +55,7 @@
         return 1;
     }
 
-    static constexpr bool supports_append()
+    static constexpr bool supports_accumulate()
     {
         return false;
     }
@@ -77,7 +75,10 @@
     // Default to the generic kernel
     kern_type kernel=a64_smallK_hybrid_fp32_mla_4x8;
 
-    smallK_hybrid_fp32_mla_4x8(const CPUInfo *ci) { UNUSED(ci); }
+    smallK_hybrid_fp32_mla_4x8(const CPUInfo *)
+    {
+
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8/generic.cpp
index 7ad5205..11888bc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6.hpp
index fcb188a..fc087b7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,7 +58,7 @@
         return 4;
     }
 
-    static constexpr bool supports_append()
+    static constexpr bool supports_accumulate()
     {
         return false;
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/a55.cpp
index 6d71840..2d6d2f0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/a55.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/generic.cpp
index d408213..88ad36a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8.hpp
index bfe896b..3de708c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,7 +58,7 @@
         return 4;
     }
 
-    static constexpr bool supports_append()
+    static constexpr bool supports_accumulate()
     {
         return false;
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/a55.cpp
index c742fcc..7135f2e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/a55.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/generic.cpp
index 2df2c30..c94e975 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6.hpp
index d817b9f..76931db 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,7 +58,7 @@
         return 4;
     }
 
-    static constexpr bool supports_append()
+    static constexpr bool supports_accumulate()
     {
         return false;
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/a55.cpp
index ceb4a3b..02894d8 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/a55.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/generic.cpp
index 4eecee4..fe69f74 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8.hpp
index b825333..d91416c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,7 +58,7 @@
         return 4;
     }
 
-    static constexpr bool supports_append()
+    static constexpr bool supports_accumulate()
     {
         return false;
     }
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/a55.cpp
index f30f7e2..e70fb69 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/a55.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/generic.cpp
index 16c1d17..2a7dd3d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp
index ac9a8d2..eba98bb 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,7 +58,7 @@
         return 2;
     }
 
-    static constexpr bool supports_append()
+    static constexpr bool supports_accumulate()
     {
         return true;
     }
@@ -78,7 +78,10 @@
     // Default to the generic kernel
     kern_type kernel=sve_hybrid_bf16fp32_dot_4VLx4;
 
-    hybrid_bf16fp32_dot_4VLx4(const CPUInfo *ci) { UNUSED(ci); }
+    hybrid_bf16fp32_dot_4VLx4(const CPUInfo *)
+    {
+
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp
index 1ee7b1c..385a16f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 
 namespace arm_gemm {
 
-void sve_hybrid_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool append) {
+void sve_hybrid_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
     const int K_stride = ((K + 1) / 2) * 2;
     const long loops_count = ((K + 8) / 16) - 1;
     K -= loops_count * 16;
@@ -41,7 +41,7 @@
     const long leftovers = K;
     const long blocks_count = (K + 1) / 2;
     float nullbias[256];
-    if (!append && !bias) {
+    if (!accumulate && !bias) {
         memset(nullbias, 0, (4 * get_vector_length<float>() * sizeof(float)));
     }
     float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
@@ -62,12 +62,23 @@
             break;
     }
 
-    for (int y=0; y<M; y+=4) {
+    int rows_to_compute;
+
+    for (int y=0; y<M; y+=rows_to_compute) {
         const bfloat16 * const a_ptr0_base = A + (y * lda);
         const unsigned long ldab = lda * sizeof(bfloat16);
 
         float *c_ptr0 = C + (y * ldc);
 
+        rows_to_compute = M-y;
+        if (rows_to_compute > 4) {
+            if (rows_to_compute % 4) {
+                rows_to_compute = 4 - 1;
+            } else {
+                rows_to_compute = 4;
+            }
+        }
+
         for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) {
             const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>()));
             long loops = loops_count;
@@ -79,7 +90,7 @@
             const unsigned long ldcb = ldc * sizeof(float);
             const float *biasptr = bias ? bias+x0 : nullbias;
 
-            switch(M-y) {
+            switch(rows_to_compute) {
                 case 1:
                     __asm __volatile (
                         "whilelt p6.h, %[temp], %[leftovers]\n"
@@ -91,7 +102,7 @@
                         "whilelt p2.s, %[temp], %[width]\n"
                         "incw %[temp], all, mul #1\n"
                         "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "ld1w z16.s, p0/z, [%[biasptr]]\n"
                         "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
                         "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
@@ -387,7 +398,7 @@
                         "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
                         "addvl %[c_ptr0], %[c_ptr0], #4\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
                     );
                     break;
@@ -406,7 +417,7 @@
                         "whilelt p2.s, %[temp], %[width]\n"
                         "incw %[temp], all, mul #1\n"
                         "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "ld1w z16.s, p0/z, [%[biasptr]]\n"
                         "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
                         "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
@@ -848,7 +859,7 @@
                         ".unreq a_ptr1\n"
                         ".unreq c_ptr1\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
                     );
                     break;
@@ -871,7 +882,7 @@
                         "whilelt p2.s, %[temp], %[width]\n"
                         "incw %[temp], all, mul #1\n"
                         "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "ld1w z16.s, p0/z, [%[biasptr]]\n"
                         "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
                         "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
@@ -1459,7 +1470,7 @@
                         ".unreq c_ptr1\n"
                         ".unreq c_ptr2\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
                     );
                     break;
@@ -1487,7 +1498,7 @@
                         "whilelt p2.s, %[temp], %[width]\n"
                         "incw %[temp], all, mul #1\n"
                         "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "ld1w z16.s, p0/z, [%[biasptr]]\n"
                         "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
                         "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
@@ -2221,7 +2232,7 @@
                         ".unreq c_ptr2\n"
                         ".unreq c_ptr3\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                     );
                     break;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp
index d889f99..641e5c1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,7 +58,7 @@
         return 4;
     }
 
-    static constexpr bool supports_append()
+    static constexpr bool supports_accumulate()
     {
         return true;
     }
@@ -78,7 +78,10 @@
     // Default to the generic kernel
     kern_type kernel=sve_hybrid_bf16fp32_mmla_4VLx4;
 
-    hybrid_bf16fp32_mmla_4VLx4(const CPUInfo *ci) { UNUSED(ci); }
+    hybrid_bf16fp32_mmla_4VLx4(const CPUInfo *)
+    {
+
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp
index e3debe5..76e3546 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 
 namespace arm_gemm {
 
-void sve_hybrid_bf16fp32_mmla_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool append) {
+void sve_hybrid_bf16fp32_mmla_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
     const int K_stride = ((K + 3) / 4) * 4;
     const long loops_count = ((K + 8) / 16) - 1;
     K -= loops_count * 16;
@@ -41,7 +41,7 @@
     const long leftovers = K;
     const long blocks_count = (K + 3) / 4;
     float nullbias[128];
-    if (!append && !bias) {
+    if (!accumulate && !bias) {
         memset(nullbias, 0, (2 * get_vector_length<float>() * sizeof(float)));
     }
     float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
@@ -62,12 +62,23 @@
             break;
     }
 
-    for (int y=0; y<M; y+=8) {
+    int rows_to_compute;
+
+    for (int y=0; y<M; y+=rows_to_compute) {
         const bfloat16 * const a_ptr0_base = A + (y * lda);
         const unsigned long ldab = lda * sizeof(bfloat16);
 
         float *c_ptr0 = C + (y * ldc);
 
+        rows_to_compute = M-y;
+        if (rows_to_compute > 8) {
+            if (rows_to_compute % 8) {
+                rows_to_compute = 8 - 1;
+            } else {
+                rows_to_compute = 8;
+            }
+        }
+
         for (int x0=0; x0<N; x0+=(2 * get_vector_length<float>())) {
             const long width = std::min((unsigned long)N-x0, (2 * get_vector_length<float>()));
             long loops = loops_count;
@@ -79,7 +90,7 @@
             const unsigned long ldcb = ldc * sizeof(float);
             const float *biasptr = bias ? bias+x0 : nullbias;
 
-            switch(M-y) {
+            switch(rows_to_compute) {
                 case 1:
                     __asm __volatile (
                         "whilelt p6.h, %[temp], %[leftovers]\n"
@@ -87,7 +98,7 @@
                         "incw %[temp], all, mul #1\n"
                         "ptrue p7.h\n"
                         "whilelt p1.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "mov z1.h, #0\n"
                         "ld1w z15.s, p0/z, [%[biasptr]]\n"
                         "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
@@ -287,7 +298,7 @@
                         "st1w z1.s, p1, [%[c_ptr0], #1, MUL VL]\n"
                         "addvl %[c_ptr0], %[c_ptr0], #2\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
                     );
                     break;
@@ -302,7 +313,7 @@
                         "incw %[temp], all, mul #1\n"
                         "ptrue p7.h\n"
                         "whilelt p1.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "ld1w z15.s, p0/z, [%[biasptr]]\n"
                         "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
@@ -513,7 +524,7 @@
                         ".unreq a_ptr1\n"
                         ".unreq c_ptr1\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
                     );
                     break;
@@ -532,7 +543,7 @@
                         "incw %[temp], all, mul #1\n"
                         "ptrue p7.h\n"
                         "whilelt p1.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "mov z3.h, #0\n"
                         "ld1w z15.s, p0/z, [%[biasptr]]\n"
                         "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
@@ -858,7 +869,7 @@
                         ".unreq c_ptr1\n"
                         ".unreq c_ptr2\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
                     );
                     break;
@@ -881,7 +892,7 @@
                         "incw %[temp], all, mul #1\n"
                         "ptrue p7.h\n"
                         "whilelt p1.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "ld1w z15.s, p0/z, [%[biasptr]]\n"
                         "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
@@ -1218,7 +1229,7 @@
                         ".unreq c_ptr2\n"
                         ".unreq c_ptr3\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                     );
                     break;
@@ -1245,7 +1256,7 @@
                         "incw %[temp], all, mul #1\n"
                         "ptrue p7.h\n"
                         "whilelt p1.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "mov z5.h, #0\n"
                         "ld1w z15.s, p0/z, [%[biasptr]]\n"
                         "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
@@ -1697,7 +1708,7 @@
                         ".unreq c_ptr3\n"
                         ".unreq c_ptr4\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
                     );
                     break;
@@ -1728,7 +1739,7 @@
                         "incw %[temp], all, mul #1\n"
                         "ptrue p7.h\n"
                         "whilelt p1.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "ld1w z15.s, p0/z, [%[biasptr]]\n"
                         "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
@@ -2191,7 +2202,7 @@
                         ".unreq c_ptr4\n"
                         ".unreq c_ptr5\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
                     );
                     break;
@@ -2226,7 +2237,7 @@
                         "incw %[temp], all, mul #1\n"
                         "ptrue p7.h\n"
                         "whilelt p1.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "mov z7.h, #0\n"
                         "ld1w z15.s, p0/z, [%[biasptr]]\n"
                         "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
@@ -2804,7 +2815,7 @@
                         ".unreq c_ptr5\n"
                         ".unreq c_ptr6\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "cc", "memory"
                     );
                     break;
@@ -2844,7 +2855,7 @@
                         "incw %[temp], all, mul #1\n"
                         "ptrue p7.h\n"
                         "whilelt p1.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "ld1w z15.s, p0/z, [%[biasptr]]\n"
                         "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
@@ -3433,7 +3444,7 @@
                         ".unreq c_ptr6\n"
                         ".unreq c_ptr7\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc", "memory"
                     );
                     break;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp
index affcafe..bd457e9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,7 +58,7 @@
         return 4;
     }
 
-    static constexpr bool supports_append()
+    static constexpr bool supports_accumulate()
     {
         return true;
     }
@@ -78,7 +78,10 @@
     // Default to the generic kernel
     kern_type kernel=sve_hybrid_bf16fp32_mmla_6VLx2;
 
-    hybrid_bf16fp32_mmla_6VLx2(const CPUInfo *ci) { UNUSED(ci); }
+    hybrid_bf16fp32_mmla_6VLx2(const CPUInfo *)
+    {
+
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp
index 07ecbf3..59dc6dc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 
 namespace arm_gemm {
 
-void sve_hybrid_bf16fp32_mmla_6VLx2(const bfloat16 *A, int lda, const bfloat16 *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool append) {
+void sve_hybrid_bf16fp32_mmla_6VLx2(const bfloat16 *A, int lda, const bfloat16 *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
     const int K_stride = ((K + 3) / 4) * 4;
     const long loops_count = ((K + 8) / 16) - 1;
     K -= loops_count * 16;
@@ -41,7 +41,7 @@
     const long leftovers = K;
     const long blocks_count = (K + 3) / 4;
     float nullbias[192];
-    if (!append && !bias) {
+    if (!accumulate && !bias) {
         memset(nullbias, 0, (3 * get_vector_length<float>() * sizeof(float)));
     }
     float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
@@ -62,12 +62,23 @@
             break;
     }
 
-    for (int y=0; y<M; y+=4) {
+    int rows_to_compute;
+
+    for (int y=0; y<M; y+=rows_to_compute) {
         const bfloat16 * const a_ptr0_base = A + (y * lda);
         const unsigned long ldab = lda * sizeof(bfloat16);
 
         float *c_ptr0 = C + (y * ldc);
 
+        rows_to_compute = M-y;
+        if (rows_to_compute > 4) {
+            if (rows_to_compute % 4) {
+                rows_to_compute = 4 - 1;
+            } else {
+                rows_to_compute = 4;
+            }
+        }
+
         for (int x0=0; x0<N; x0+=(3 * get_vector_length<float>())) {
             const long width = std::min((unsigned long)N-x0, (3 * get_vector_length<float>()));
             long loops = loops_count;
@@ -79,7 +90,7 @@
             const unsigned long ldcb = ldc * sizeof(float);
             const float *biasptr = bias ? bias+x0 : nullbias;
 
-            switch(M-y) {
+            switch(rows_to_compute) {
                 case 1:
                     __asm __volatile (
                         "whilelt p6.h, %[temp], %[leftovers]\n"
@@ -89,7 +100,7 @@
                         "whilelt p1.s, %[temp], %[width]\n"
                         "incw %[temp], all, mul #1\n"
                         "whilelt p2.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "mov z1.h, #0\n"
                         "ld1w z19.s, p0/z, [%[biasptr]]\n"
                         "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
@@ -362,7 +373,7 @@
                         "st1w z2.s, p2, [%[c_ptr0], #2, MUL VL]\n"
                         "addvl %[c_ptr0], %[c_ptr0], #3\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
                     );
                     break;
@@ -379,7 +390,7 @@
                         "whilelt p1.s, %[temp], %[width]\n"
                         "incw %[temp], all, mul #1\n"
                         "whilelt p2.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "ld1w z19.s, p0/z, [%[biasptr]]\n"
                         "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
@@ -665,7 +676,7 @@
                         ".unreq a_ptr1\n"
                         ".unreq c_ptr1\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
                     );
                     break;
@@ -686,7 +697,7 @@
                         "whilelt p1.s, %[temp], %[width]\n"
                         "incw %[temp], all, mul #1\n"
                         "whilelt p2.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "mov z3.h, #0\n"
                         "ld1w z19.s, p0/z, [%[biasptr]]\n"
                         "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
@@ -1127,7 +1138,7 @@
                         ".unreq c_ptr1\n"
                         ".unreq c_ptr2\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
                     );
                     break;
@@ -1153,7 +1164,7 @@
                         "whilelt p1.s, %[temp], %[width]\n"
                         "incw %[temp], all, mul #1\n"
                         "whilelt p2.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "ld1w z19.s, p0/z, [%[biasptr]]\n"
                         "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
@@ -1607,7 +1618,7 @@
                         ".unreq c_ptr2\n"
                         ".unreq c_ptr3\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                     );
                     break;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp
index af8babd..f25f747 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,7 +58,7 @@
         return 4;
     }
 
-    static constexpr bool supports_append()
+    static constexpr bool supports_accumulate()
     {
         return true;
     }
@@ -78,7 +78,10 @@
     // Default to the generic kernel
     kern_type kernel=sve_hybrid_bf16fp32_mmla_8VLx2;
 
-    hybrid_bf16fp32_mmla_8VLx2(const CPUInfo *ci) { UNUSED(ci); }
+    hybrid_bf16fp32_mmla_8VLx2(const CPUInfo *)
+    {
+
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp
index 7319616..f38a2ea 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 
 namespace arm_gemm {
 
-void sve_hybrid_bf16fp32_mmla_8VLx2(const bfloat16 *A, int lda, const bfloat16 *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool append) {
+void sve_hybrid_bf16fp32_mmla_8VLx2(const bfloat16 *A, int lda, const bfloat16 *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
     const int K_stride = ((K + 3) / 4) * 4;
     const long loops_count = ((K + 8) / 16) - 1;
     K -= loops_count * 16;
@@ -41,7 +41,7 @@
     const long leftovers = K;
     const long blocks_count = (K + 3) / 4;
     float nullbias[256];
-    if (!append && !bias) {
+    if (!accumulate && !bias) {
         memset(nullbias, 0, (4 * get_vector_length<float>() * sizeof(float)));
     }
     float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
@@ -62,12 +62,23 @@
             break;
     }
 
-    for (int y=0; y<M; y+=4) {
+    int rows_to_compute;
+
+    for (int y=0; y<M; y+=rows_to_compute) {
         const bfloat16 * const a_ptr0_base = A + (y * lda);
         const unsigned long ldab = lda * sizeof(bfloat16);
 
         float *c_ptr0 = C + (y * ldc);
 
+        rows_to_compute = M-y;
+        if (rows_to_compute > 4) {
+            if (rows_to_compute % 4) {
+                rows_to_compute = 4 - 1;
+            } else {
+                rows_to_compute = 4;
+            }
+        }
+
         for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) {
             const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>()));
             long loops = loops_count;
@@ -79,7 +90,7 @@
             const unsigned long ldcb = ldc * sizeof(float);
             const float *biasptr = bias ? bias+x0 : nullbias;
 
-            switch(M-y) {
+            switch(rows_to_compute) {
                 case 1:
                     __asm __volatile (
                         "whilelt p6.h, %[temp], %[leftovers]\n"
@@ -91,7 +102,7 @@
                         "whilelt p2.s, %[temp], %[width]\n"
                         "incw %[temp], all, mul #1\n"
                         "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "mov z1.h, #0\n"
                         "ld1w z15.s, p0/z, [%[biasptr]]\n"
                         "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
@@ -432,7 +443,7 @@
                         "st1w z3.s, p3, [%[c_ptr0], #3, MUL VL]\n"
                         "addvl %[c_ptr0], %[c_ptr0], #4\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
                     );
                     break;
@@ -451,7 +462,7 @@
                         "whilelt p2.s, %[temp], %[width]\n"
                         "incw %[temp], all, mul #1\n"
                         "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "ld1w z15.s, p0/z, [%[biasptr]]\n"
                         "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
@@ -807,7 +818,7 @@
                         ".unreq a_ptr1\n"
                         ".unreq c_ptr1\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
                     );
                     break;
@@ -830,7 +841,7 @@
                         "whilelt p2.s, %[temp], %[width]\n"
                         "incw %[temp], all, mul #1\n"
                         "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "mov z3.h, #0\n"
                         "ld1w z15.s, p0/z, [%[biasptr]]\n"
                         "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
@@ -1381,7 +1392,7 @@
                         ".unreq c_ptr1\n"
                         ".unreq c_ptr2\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
                     );
                     break;
@@ -1409,7 +1420,7 @@
                         "whilelt p2.s, %[temp], %[width]\n"
                         "incw %[temp], all, mul #1\n"
                         "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "ld1w z15.s, p0/z, [%[biasptr]]\n"
                         "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
                         "add %[a_ptr0], %[a_ptr0], #0x10\n"
@@ -1975,7 +1986,7 @@
                         ".unreq c_ptr2\n"
                         ".unreq c_ptr3\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                     );
                     break;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp
index 28ef807..ebef413 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,7 +58,7 @@
         return 1;
     }
 
-    static constexpr bool supports_append()
+    static constexpr bool supports_accumulate()
     {
         return true;
     }
@@ -78,7 +78,10 @@
     // Default to the generic kernel
     kern_type kernel=sve_hybrid_fp16_mla_4VLx4;
 
-    hybrid_fp16_mla_4VLx4(const CPUInfo *ci) { UNUSED(ci); }
+    hybrid_fp16_mla_4VLx4(const CPUInfo *)
+    {
+
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp
index 2998f33..7610a20 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 
 namespace arm_gemm {
 
-void sve_hybrid_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, __fp16 *C, int ldc, int M, int N, int K, const __fp16 *bias, Activation act, bool append) {
+void sve_hybrid_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, __fp16 *C, int ldc, int M, int N, int K, const __fp16 *bias, Activation act, bool accumulate) {
     const int K_stride = K;
     const long loops_count = ((K + 8) / 16) - 1;
     K -= loops_count * 16;
@@ -40,7 +40,7 @@
     K -= (regs_count + 1) * 8;
     const long leftovers = K;
     __fp16 nullbias[512];
-    if (!append && !bias) {
+    if (!accumulate && !bias) {
         memset(nullbias, 0, (4 * get_vector_length<__fp16>() * sizeof(__fp16)));
     }
     __fp16 minval = - static_cast<__fp16>(std::numeric_limits<float>::infinity());
@@ -61,12 +61,23 @@
             break;
     }
 
-    for (int y=0; y<M; y+=4) {
+    int rows_to_compute;
+
+    for (int y=0; y<M; y+=rows_to_compute) {
         const __fp16 * const a_ptr0_base = A + (y * lda);
         const unsigned long ldab = lda * sizeof(__fp16);
 
         __fp16 *c_ptr0 = C + (y * ldc);
 
+        rows_to_compute = M-y;
+        if (rows_to_compute > 4) {
+            if (rows_to_compute % 4) {
+                rows_to_compute = 4 - 1;
+            } else {
+                rows_to_compute = 4;
+            }
+        }
+
         for (int x0=0; x0<N; x0+=(4 * get_vector_length<__fp16>())) {
             const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<__fp16>()));
             long loops = loops_count;
@@ -78,7 +89,7 @@
             const unsigned long ldcb = ldc * sizeof(__fp16);
             const __fp16 *biasptr = bias ? bias+x0 : nullbias;
 
-            switch(M-y) {
+            switch(rows_to_compute) {
                 case 1:
                     __asm __volatile (
                         "whilelt p6.h, %[temp], %[leftovers]\n"
@@ -90,7 +101,7 @@
                         "whilelt p2.h, %[temp], %[width]\n"
                         "inch %[temp], all, mul #1\n"
                         "whilelt p3.h, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "ld1h z16.h, p0/z, [%[biasptr]]\n"
                         "ld1h z17.h, p1/z, [%[biasptr], #1, MUL VL]\n"
                         "ld1h z18.h, p2/z, [%[biasptr], #2, MUL VL]\n"
@@ -613,7 +624,7 @@
                         "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
                         "addvl %[c_ptr0], %[c_ptr0], #4\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
                     );
                     break;
@@ -632,7 +643,7 @@
                         "whilelt p2.h, %[temp], %[width]\n"
                         "inch %[temp], all, mul #1\n"
                         "whilelt p3.h, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "ld1h z16.h, p0/z, [%[biasptr]]\n"
                         "ld1h z17.h, p1/z, [%[biasptr], #1, MUL VL]\n"
                         "ld1h z18.h, p2/z, [%[biasptr], #2, MUL VL]\n"
@@ -1405,7 +1416,7 @@
                         ".unreq a_ptr1\n"
                         ".unreq c_ptr1\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
                     );
                     break;
@@ -1428,7 +1439,7 @@
                         "whilelt p2.h, %[temp], %[width]\n"
                         "inch %[temp], all, mul #1\n"
                         "whilelt p3.h, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "ld1h z16.h, p0/z, [%[biasptr]]\n"
                         "ld1h z17.h, p1/z, [%[biasptr], #1, MUL VL]\n"
                         "ld1h z18.h, p2/z, [%[biasptr], #2, MUL VL]\n"
@@ -2451,7 +2462,7 @@
                         ".unreq c_ptr1\n"
                         ".unreq c_ptr2\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
                     );
                     break;
@@ -2479,7 +2490,7 @@
                         "whilelt p2.h, %[temp], %[width]\n"
                         "inch %[temp], all, mul #1\n"
                         "whilelt p3.h, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "ld1h z16.h, p0/z, [%[biasptr]]\n"
                         "ld1h z17.h, p1/z, [%[biasptr], #1, MUL VL]\n"
                         "ld1h z18.h, p2/z, [%[biasptr], #2, MUL VL]\n"
@@ -3752,7 +3763,7 @@
                         ".unreq c_ptr2\n"
                         ".unreq c_ptr3\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                     );
                     break;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp
index 8e3c179..1bc8021 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,7 +58,7 @@
         return 1;
     }
 
-    static constexpr bool supports_append()
+    static constexpr bool supports_accumulate()
     {
         return true;
     }
@@ -78,7 +78,10 @@
     // Default to the generic kernel
     kern_type kernel=sve_hybrid_fp32_mla_4VLx4;
 
-    hybrid_fp32_mla_4VLx4(const CPUInfo *ci) { UNUSED(ci); }
+    hybrid_fp32_mla_4VLx4(const CPUInfo *)
+    {
+
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp
index 855d27a..ce36243 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 
 namespace arm_gemm {
 
-void sve_hybrid_fp32_mla_4VLx4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool append) {
+void sve_hybrid_fp32_mla_4VLx4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
     const int K_stride = K;
     const long loops_count = ((K + 4) / 8) - 1;
     K -= loops_count * 8;
@@ -40,7 +40,7 @@
     K -= (regs_count + 1) * 4;
     const long leftovers = K;
     float nullbias[256];
-    if (!append && !bias) {
+    if (!accumulate && !bias) {
         memset(nullbias, 0, (4 * get_vector_length<float>() * sizeof(float)));
     }
     float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
@@ -61,12 +61,23 @@
             break;
     }
 
-    for (int y=0; y<M; y+=4) {
+    int rows_to_compute;
+
+    for (int y=0; y<M; y+=rows_to_compute) {
         const float * const a_ptr0_base = A + (y * lda);
         const unsigned long ldab = lda * sizeof(float);
 
         float *c_ptr0 = C + (y * ldc);
 
+        rows_to_compute = M-y;
+        if (rows_to_compute > 4) {
+            if (rows_to_compute % 4) {
+                rows_to_compute = 4 - 1;
+            } else {
+                rows_to_compute = 4;
+            }
+        }
+
         for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) {
             const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>()));
             long loops = loops_count;
@@ -78,7 +89,7 @@
             const unsigned long ldcb = ldc * sizeof(float);
             const float *biasptr = bias ? bias+x0 : nullbias;
 
-            switch(M-y) {
+            switch(rows_to_compute) {
                 case 1:
                     __asm __volatile (
                         "whilelt p6.s, %[temp], %[leftovers]\n"
@@ -90,7 +101,7 @@
                         "whilelt p2.s, %[temp], %[width]\n"
                         "incw %[temp], all, mul #1\n"
                         "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "ld1w z16.s, p0/z, [%[biasptr]]\n"
                         "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
                         "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
@@ -366,7 +377,7 @@
                         "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
                         "addvl %[c_ptr0], %[c_ptr0], #4\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
                     );
                     break;
@@ -385,7 +396,7 @@
                         "whilelt p2.s, %[temp], %[width]\n"
                         "incw %[temp], all, mul #1\n"
                         "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "ld1w z16.s, p0/z, [%[biasptr]]\n"
                         "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
                         "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
@@ -799,7 +810,7 @@
                         ".unreq a_ptr1\n"
                         ".unreq c_ptr1\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
                     );
                     break;
@@ -822,7 +833,7 @@
                         "whilelt p2.s, %[temp], %[width]\n"
                         "incw %[temp], all, mul #1\n"
                         "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "ld1w z16.s, p0/z, [%[biasptr]]\n"
                         "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
                         "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
@@ -1374,7 +1385,7 @@
                         ".unreq c_ptr1\n"
                         ".unreq c_ptr2\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
                     );
                     break;
@@ -1402,7 +1413,7 @@
                         "whilelt p2.s, %[temp], %[width]\n"
                         "incw %[temp], all, mul #1\n"
                         "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "ld1w z16.s, p0/z, [%[biasptr]]\n"
                         "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
                         "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
@@ -2092,7 +2103,7 @@
                         ".unreq c_ptr2\n"
                         ".unreq c_ptr3\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                     );
                     break;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4.hpp
similarity index 74%
rename from src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp
rename to src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4.hpp
index 19e5fbd..fd416ed 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,39 +25,42 @@
 
 #ifdef __ARM_FEATURE_SVE
 
+
+#include "../std_transforms_sve.hpp"
+
 namespace arm_gemm
 {
 
 // Actual kernel implementations
-void sve_native_fp32_mla_4VLx4(const float *, int, const float *, int ldb, float *, int, int, int, int, const float *, Activation, bool);
+void sve_hybrid_fp32_mmla_4VLx4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
 
-class native_fp32_mla_4VLx4
+class hybrid_fp32_mmla_4VLx4
 {
 public:
     typedef float operand_type;
     typedef float result_type;
 
-    typedef void (*kern_type)(const float *, int, const float *, int ldb, float *, int, int, int, int, const float *, Activation, bool);
+    typedef void (*kern_type)(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
 
     /* Kernel blocking parameters */
     static constexpr unsigned int out_height()
     {
-        return 4;
+        return 8;
     }
 
     static unsigned int out_width()
     {
-        return get_vector_length<float>() * 4;
+        return get_vector_length<float>() * 2;
     }
 
     static constexpr unsigned int k_unroll()
     {
-        return 1;
+        return 2;
     }
 
-    static constexpr bool supports_append()
+    static constexpr bool supports_accumulate()
     {
-        return false;
+        return true;
     }
 
     static constexpr bool supports_bias()
@@ -70,12 +73,15 @@
         return true;
     }
 
-
+    StdTransformsSVE<operand_type, result_type, 4, 4, 2> transforms = {};
 
     // Default to the generic kernel
-    kern_type kernel=sve_native_fp32_mla_4VLx4;
+    kern_type kernel=sve_hybrid_fp32_mmla_4VLx4;
 
-    native_fp32_mla_4VLx4(const CPUInfo *ci) { UNUSED(ci); }
+    hybrid_fp32_mmla_4VLx4(const CPUInfo *)
+    {
+
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp
new file mode 100644
index 0000000..1364585
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp
@@ -0,0 +1,3459 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include <algorithm>
+
+#include "arm_gemm.hpp"
+
+#include "../../asmlib.hpp"
+#include "../../utils.hpp"
+
+namespace arm_gemm {
+
+void sve_hybrid_fp32_mmla_4VLx4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
+    const int K_stride = ((K + 1) / 2) * 2;
+    const long loops_count = ((K + 4) / 8) - 1;
+    K -= loops_count * 8;
+    const long regs_count = (K / 4) - 1;
+    K -= (regs_count + 1) * 4;
+    const long leftovers = K;
+    const long blocks_count = (K + 1) / 2;
+    float nullbias[128];
+    if (!accumulate && !bias) {
+        memset(nullbias, 0, (2 * get_vector_length<float>() * sizeof(float)));
+    }
+    float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+    float maxval =   static_cast<float>(std::numeric_limits<float>::infinity());
+    const float * const minptr = &minval;
+    const float * const maxptr = &maxval;
+
+    switch(act.type)
+    {
+        default:
+        case Activation::Type::None:
+            break;
+        case Activation::Type::BoundedReLU:
+            maxval = static_cast<float>(act.param1);
+            /* fall through */
+        case Activation::Type::ReLU:
+            minval = 0.0f;
+            break;
+    }
+
+    int rows_to_compute;
+
+    for (int y=0; y<M; y+=rows_to_compute) {
+        const float * const a_ptr0_base = A + (y * lda);
+        const unsigned long ldab = lda * sizeof(float);
+
+        float *c_ptr0 = C + (y * ldc);
+
+        rows_to_compute = M-y;
+        if (rows_to_compute > 8) {
+            if (rows_to_compute % 8) {
+                rows_to_compute = 8 - 1;
+            } else {
+                rows_to_compute = 8;
+            }
+        }
+
+        for (int x0=0; x0<N; x0+=(2 * get_vector_length<float>())) {
+            const long width = std::min((unsigned long)N-x0, (2 * get_vector_length<float>()));
+            long loops = loops_count;
+            long regs = regs_count;
+            long temp = 0;
+            long blocks = blocks_count;
+            const float *a_ptr0 = a_ptr0_base;
+            const float *b_ptr0 = B + (K_stride * x0);
+            const unsigned long ldcb = ldc * sizeof(float);
+            const float *biasptr = bias ? bias+x0 : nullbias;
+
+            switch(rows_to_compute) {
+                case 1:
+                    __asm __volatile (
+                        "whilelt p6.s, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.s\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "cbnz %[accumulate], 1f\n"
+                        "mov z1.s, #0\n"
+                        "ld1w z15.s, p0/z, [%[biasptr]]\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "zip1 z16.s, z15.s, z15.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "zip2 z17.s, z15.s, z15.s\n"
+                        "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "zip1 z18.s, z15.s, z15.s\n"
+                        "zip2 z19.s, z15.s, z15.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "mov z14.s, #0\n"
+                        "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
+                        "mov z1.s, #0\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "zip1 z16.s, z13.s, z14.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "zip2 z17.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        "mov z14.s, #0\n"
+                        "zip1 z18.s, z13.s, z14.s\n"
+                        "zip2 z19.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z5.s, #0\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "trn2 z8.d, z4.d, z5.d\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "trn1 z0.d, z4.d, z5.d\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "mov z1.s, #0\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z5.s, #0\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "trn2 z8.d, z4.d, z5.d\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "trn1 z0.d, z4.d, z5.d\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "mov z1.s, #0\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #-4\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        "cbz %[blocks], 5f\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "b.eq 5f\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z5.s, #0\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "trn1 z0.d, z4.d, z5.d\n"
+                        "cbz %[blocks], 5f\n"
+                        "trn2 z8.d, z4.d, z5.d\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "b.eq 5f\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "5:\n"
+                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
+                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
+                        "fmax z16.s, p7/m, z16.s, z14.s\n"
+                        "fmax z17.s, p7/m, z17.s, z14.s\n"
+                        "fmax z18.s, p7/m, z18.s, z14.s\n"
+                        "fmax z19.s, p7/m, z19.s, z14.s\n"
+                        "fmin z16.s, p7/m, z16.s, z15.s\n"
+                        "fmin z17.s, p7/m, z17.s, z15.s\n"
+                        "fmin z18.s, p7/m, z18.s, z15.s\n"
+                        "fmin z19.s, p7/m, z19.s, z15.s\n"
+                        "uzp1 z0.s, z16.s, z17.s\n"
+                        "uzp1 z1.s, z18.s, z19.s\n"
+                        "st1w z0.s, p0, [%[c_ptr0]]\n"
+                        "st1w z1.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #2\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+                    );
+                    break;
+                case 2:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "c_ptr1 .req X1\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "whilelt p6.s, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.s\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "cbnz %[accumulate], 1f\n"
+                        "ld1w z15.s, p0/z, [%[biasptr]]\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "zip1 z16.s, z15.s, z15.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "zip2 z17.s, z15.s, z15.s\n"
+                        "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "zip1 z18.s, z15.s, z15.s\n"
+                        "zip2 z19.s, z15.s, z15.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z14.s, p0/z, [c_ptr1]\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "zip1 z16.s, z13.s, z14.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "zip2 z17.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "zip1 z18.s, z13.s, z14.s\n"
+                        "zip2 z19.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "trn1 z0.d, z4.d, z5.d\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "trn2 z8.d, z4.d, z5.d\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "trn1 z0.d, z4.d, z5.d\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "trn2 z8.d, z4.d, z5.d\n"
+                        "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "addvl a_ptr1, a_ptr1, #2\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #-4\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        "cbz %[blocks], 5f\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "b.eq 5f\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "addvl a_ptr1, a_ptr1, #1\n"
+                        "trn1 z0.d, z4.d, z5.d\n"
+                        "cbz %[blocks], 5f\n"
+                        "trn2 z8.d, z4.d, z5.d\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "b.eq 5f\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "5:\n"
+                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
+                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
+                        "fmax z16.s, p7/m, z16.s, z14.s\n"
+                        "fmax z17.s, p7/m, z17.s, z14.s\n"
+                        "fmax z18.s, p7/m, z18.s, z14.s\n"
+                        "fmax z19.s, p7/m, z19.s, z14.s\n"
+                        "fmin z16.s, p7/m, z16.s, z15.s\n"
+                        "fmin z17.s, p7/m, z17.s, z15.s\n"
+                        "fmin z18.s, p7/m, z18.s, z15.s\n"
+                        "fmin z19.s, p7/m, z19.s, z15.s\n"
+                        "uzp1 z0.s, z16.s, z17.s\n"
+                        "uzp2 z1.s, z16.s, z17.s\n"
+                        "uzp1 z2.s, z18.s, z19.s\n"
+                        "uzp2 z3.s, z18.s, z19.s\n"
+                        "st1w z0.s, p0, [%[c_ptr0]]\n"
+                        "st1w z1.s, p0, [c_ptr1]\n"
+                        "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #2\n"
+                        "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq c_ptr1\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
+                    );
+                    break;
+                case 3:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "c_ptr1 .req X2\n"
+                        "c_ptr2 .req X3\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "whilelt p6.s, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.s\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "cbnz %[accumulate], 1f\n"
+                        "mov z3.s, #0\n"
+                        "ld1w z15.s, p0/z, [%[biasptr]]\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "zip1 z16.s, z15.s, z15.s\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                        "zip2 z17.s, z15.s, z15.s\n"
+                        "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "zip1 z18.s, z15.s, z15.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "zip2 z19.s, z15.s, z15.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "trn1 z9.d, z2.d, z3.d\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                        "mov z20.d, z16.d\n"
+                        "mov z21.d, z17.d\n"
+                        "mov z22.d, z18.d\n"
+                        "mov z23.d, z19.d\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "mov z3.s, #0\n"
+                        "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z14.s, p0/z, [c_ptr1]\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "zip1 z16.s, z13.s, z14.s\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                        "zip2 z17.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "trn1 z9.d, z2.d, z3.d\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "zip1 z18.s, z13.s, z14.s\n"
+                        "zip2 z19.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p0/z, [c_ptr2]\n"
+                        "mov z14.s, #0\n"
+                        "zip1 z20.s, z13.s, z14.s\n"
+                        "zip2 z21.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "mov z14.s, #0\n"
+                        "zip1 z22.s, z13.s, z14.s\n"
+                        "zip2 z23.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        "trn2 z1.d, z2.d, z3.d\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        "trn2 z8.d, z4.d, z5.d\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z7.s, #0\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        "trn2 z9.d, z6.d, z7.d\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "trn1 z0.d, z4.d, z5.d\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "trn1 z1.d, z6.d, z7.d\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "mov z3.s, #0\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "trn1 z9.d, z2.d, z3.d\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        "trn2 z1.d, z2.d, z3.d\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "addvl a_ptr2, a_ptr2, #2\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "trn2 z8.d, z4.d, z5.d\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z7.s, #0\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        "trn2 z9.d, z6.d, z7.d\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "trn1 z0.d, z4.d, z5.d\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "trn1 z1.d, z6.d, z7.d\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "mov z3.s, #0\n"
+                        "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #-4\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "addvl a_ptr1, a_ptr1, #2\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "trn1 z9.d, z2.d, z3.d\n"
+                        "cbz %[blocks], 5f\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "trn2 z1.d, z2.d, z3.d\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "b.eq 5f\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+                        "trn2 z1.d, z2.d, z3.d\n"
+                        "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "addvl a_ptr1, a_ptr1, #1\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "addvl a_ptr2, a_ptr2, #1\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z7.s, #0\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "trn1 z0.d, z4.d, z5.d\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "trn1 z1.d, z6.d, z7.d\n"
+                        "cbz %[blocks], 5f\n"
+                        "trn2 z9.d, z6.d, z7.d\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "trn2 z8.d, z4.d, z5.d\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "b.eq 5f\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "5:\n"
+                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
+                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
+                        "fmax z16.s, p7/m, z16.s, z14.s\n"
+                        "fmax z17.s, p7/m, z17.s, z14.s\n"
+                        "fmax z18.s, p7/m, z18.s, z14.s\n"
+                        "fmax z19.s, p7/m, z19.s, z14.s\n"
+                        "fmin z16.s, p7/m, z16.s, z15.s\n"
+                        "fmin z17.s, p7/m, z17.s, z15.s\n"
+                        "fmin z18.s, p7/m, z18.s, z15.s\n"
+                        "fmin z19.s, p7/m, z19.s, z15.s\n"
+                        "fmax z20.s, p7/m, z20.s, z14.s\n"
+                        "uzp1 z0.s, z16.s, z17.s\n"
+                        "uzp2 z1.s, z16.s, z17.s\n"
+                        "uzp1 z2.s, z18.s, z19.s\n"
+                        "uzp2 z3.s, z18.s, z19.s\n"
+                        "st1w z0.s, p0, [%[c_ptr0]]\n"
+                        "fmin z20.s, p7/m, z20.s, z15.s\n"
+                        "fmax z21.s, p7/m, z21.s, z14.s\n"
+                        "fmax z22.s, p7/m, z22.s, z14.s\n"
+                        "st1w z1.s, p0, [c_ptr1]\n"
+                        "fmax z23.s, p7/m, z23.s, z14.s\n"
+                        "fmin z21.s, p7/m, z21.s, z15.s\n"
+                        "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "fmin z22.s, p7/m, z22.s, z15.s\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #2\n"
+                        "fmin z23.s, p7/m, z23.s, z15.s\n"
+                        "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "uzp1 z4.s, z20.s, z21.s\n"
+                        "uzp1 z5.s, z22.s, z23.s\n"
+                        "st1w z4.s, p0, [c_ptr2]\n"
+                        "st1w z5.s, p1, [c_ptr2, #1, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
+                    );
+                    break;
+                case 4:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "a_ptr3 .req X2\n"
+                        "c_ptr1 .req X3\n"
+                        "c_ptr2 .req X4\n"
+                        "c_ptr3 .req X5\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
+                        "whilelt p6.s, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.s\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "cbnz %[accumulate], 1f\n"
+                        "ld1w z15.s, p0/z, [%[biasptr]]\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "zip1 z16.s, z15.s, z15.s\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                        "zip2 z17.s, z15.s, z15.s\n"
+                        "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "zip1 z18.s, z15.s, z15.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "zip2 z19.s, z15.s, z15.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "trn1 z9.d, z2.d, z3.d\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z20.d, z16.d\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "mov z21.d, z17.d\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                        "mov z22.d, z18.d\n"
+                        "mov z23.d, z19.d\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z14.s, p0/z, [c_ptr1]\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "zip1 z16.s, z13.s, z14.s\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                        "zip2 z17.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "zip1 z18.s, z13.s, z14.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "zip2 z19.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p0/z, [c_ptr2]\n"
+                        "trn1 z9.d, z2.d, z3.d\n"
+                        "ld1w z14.s, p0/z, [c_ptr3]\n"
+                        "zip1 z20.s, z13.s, z14.s\n"
+                        "zip2 z21.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+                        "zip1 z22.s, z13.s, z14.s\n"
+                        "zip2 z23.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        "trn2 z1.d, z2.d, z3.d\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        "trn2 z8.d, z4.d, z5.d\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "trn2 z9.d, z6.d, z7.d\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "trn1 z0.d, z4.d, z5.d\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "trn1 z1.d, z6.d, z7.d\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "trn1 z9.d, z2.d, z3.d\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        "trn2 z1.d, z2.d, z3.d\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+                        "trn2 z8.d, z4.d, z5.d\n"
+                        "addvl a_ptr2, a_ptr2, #2\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "trn2 z9.d, z6.d, z7.d\n"
+                        "addvl a_ptr3, a_ptr3, #2\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "trn1 z0.d, z4.d, z5.d\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "trn1 z1.d, z6.d, z7.d\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #-4\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "addvl a_ptr1, a_ptr1, #2\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "trn1 z9.d, z2.d, z3.d\n"
+                        "cbz %[blocks], 5f\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "trn2 z1.d, z2.d, z3.d\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "b.eq 5f\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+                        "trn2 z1.d, z2.d, z3.d\n"
+                        "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "ld1rqw z7.s, p6/z, [a_ptr3]\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "addvl a_ptr1, a_ptr1, #1\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        "addvl a_ptr2, a_ptr2, #1\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        "addvl a_ptr3, a_ptr3, #1\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "trn1 z0.d, z4.d, z5.d\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "trn1 z1.d, z6.d, z7.d\n"
+                        "cbz %[blocks], 5f\n"
+                        "trn2 z9.d, z6.d, z7.d\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "trn2 z8.d, z4.d, z5.d\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "b.eq 5f\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "5:\n"
+                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
+                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
+                        "fmax z16.s, p7/m, z16.s, z14.s\n"
+                        "fmax z17.s, p7/m, z17.s, z14.s\n"
+                        "fmax z18.s, p7/m, z18.s, z14.s\n"
+                        "fmax z19.s, p7/m, z19.s, z14.s\n"
+                        "fmin z16.s, p7/m, z16.s, z15.s\n"
+                        "fmin z17.s, p7/m, z17.s, z15.s\n"
+                        "fmin z18.s, p7/m, z18.s, z15.s\n"
+                        "fmin z19.s, p7/m, z19.s, z15.s\n"
+                        "fmax z20.s, p7/m, z20.s, z14.s\n"
+                        "uzp1 z0.s, z16.s, z17.s\n"
+                        "uzp2 z1.s, z16.s, z17.s\n"
+                        "uzp1 z2.s, z18.s, z19.s\n"
+                        "uzp2 z3.s, z18.s, z19.s\n"
+                        "st1w z0.s, p0, [%[c_ptr0]]\n"
+                        "fmin z20.s, p7/m, z20.s, z15.s\n"
+                        "fmax z21.s, p7/m, z21.s, z14.s\n"
+                        "fmax z22.s, p7/m, z22.s, z14.s\n"
+                        "st1w z1.s, p0, [c_ptr1]\n"
+                        "fmax z23.s, p7/m, z23.s, z14.s\n"
+                        "fmin z21.s, p7/m, z21.s, z15.s\n"
+                        "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "fmin z22.s, p7/m, z22.s, z15.s\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #2\n"
+                        "fmin z23.s, p7/m, z23.s, z15.s\n"
+                        "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "uzp1 z4.s, z20.s, z21.s\n"
+                        "uzp2 z5.s, z20.s, z21.s\n"
+                        "uzp1 z6.s, z22.s, z23.s\n"
+                        "st1w z4.s, p0, [c_ptr2]\n"
+                        "uzp2 z7.s, z22.s, z23.s\n"
+                        "st1w z5.s, p0, [c_ptr3]\n"
+                        "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
+                        "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq a_ptr3\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq c_ptr3\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
+                    );
+                    break;
+                case 5:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "a_ptr3 .req X2\n"
+                        "a_ptr4 .req X3\n"
+                        "c_ptr1 .req X4\n"
+                        "c_ptr2 .req X5\n"
+                        "c_ptr3 .req X6\n"
+                        "c_ptr4 .req X7\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
+                        "add a_ptr4, a_ptr3, %[lda]\n"
+                        "add c_ptr4, c_ptr3, %[ldc]\n"
+                        "whilelt p6.s, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.s\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "cbnz %[accumulate], 1f\n"
+                        "mov z5.s, #0\n"
+                        "ld1w z15.s, p0/z, [%[biasptr]]\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "zip1 z16.s, z15.s, z15.s\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                        "zip2 z17.s, z15.s, z15.s\n"
+                        "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                        "ld1rqw z4.s, p7/z, [a_ptr4]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "zip1 z18.s, z15.s, z15.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "zip2 z19.s, z15.s, z15.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "trn1 z9.d, z2.d, z3.d\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "trn1 z10.d, z4.d, z5.d\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z20.d, z16.d\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "mov z21.d, z17.d\n"
+                        "add a_ptr4, a_ptr4, #0x10\n"
+                        "mov z22.d, z18.d\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                        "mov z23.d, z19.d\n"
+                        "mov z24.d, z16.d\n"
+                        "mov z25.d, z17.d\n"
+                        "mov z26.d, z18.d\n"
+                        "mov z27.d, z19.d\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "mov z5.s, #0\n"
+                        "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z14.s, p0/z, [c_ptr1]\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "zip1 z16.s, z13.s, z14.s\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                        "zip2 z17.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ld1rqw z4.s, p7/z, [a_ptr4]\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "zip1 z18.s, z13.s, z14.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "zip2 z19.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p0/z, [c_ptr2]\n"
+                        "trn1 z9.d, z2.d, z3.d\n"
+                        "ld1w z14.s, p0/z, [c_ptr3]\n"
+                        "trn1 z10.d, z4.d, z5.d\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr4, a_ptr4, #0x10\n"
+                        "zip1 z20.s, z13.s, z14.s\n"
+                        "zip2 z21.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+                        "zip1 z22.s, z13.s, z14.s\n"
+                        "zip2 z23.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p0/z, [c_ptr4]\n"
+                        "mov z14.s, #0\n"
+                        "zip1 z24.s, z13.s, z14.s\n"
+                        "zip2 z25.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
+                        "mov z14.s, #0\n"
+                        "zip1 z26.s, z13.s, z14.s\n"
+                        "zip2 z27.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+                        "trn2 z1.d, z2.d, z3.d\n"
+                        "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+                        "trn2 z2.d, z4.d, z5.d\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "ld1rqw z8.s, p7/z, [a_ptr4]\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
+                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z9.s, #0\n"
+                        "add a_ptr4, a_ptr4, #0x20\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        "trn2 z10.d, z8.d, z9.d\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "trn1 z0.d, z4.d, z5.d\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "trn1 z1.d, z6.d, z7.d\n"
+                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "trn1 z2.d, z8.d, z9.d\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "trn2 z9.d, z6.d, z7.d\n"
+                        "trn2 z8.d, z4.d, z5.d\n"
+                        "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "mov z5.s, #0\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "trn1 z9.d, z2.d, z3.d\n"
+                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "trn1 z10.d, z4.d, z5.d\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+                        "trn2 z1.d, z2.d, z3.d\n"
+                        "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+                        "trn2 z2.d, z4.d, z5.d\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "addvl a_ptr3, a_ptr3, #2\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "ld1rqw z8.s, p7/z, [a_ptr4]\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z9.s, #0\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        "trn2 z10.d, z8.d, z9.d\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "trn1 z0.d, z4.d, z5.d\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "trn1 z1.d, z6.d, z7.d\n"
+                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "trn1 z2.d, z8.d, z9.d\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "trn2 z9.d, z6.d, z7.d\n"
+                        "trn2 z8.d, z4.d, z5.d\n"
+                        "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "addvl a_ptr4, a_ptr4, #2\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "mov z5.s, #0\n"
+                        "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #-4\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "addvl a_ptr1, a_ptr1, #2\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "addvl a_ptr2, a_ptr2, #2\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "trn1 z9.d, z2.d, z3.d\n"
+                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+                        "trn1 z10.d, z4.d, z5.d\n"
+                        "cbz %[blocks], 5f\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "trn2 z1.d, z2.d, z3.d\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "trn2 z2.d, z4.d, z5.d\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+                        "b.eq 5f\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+                        "trn2 z1.d, z2.d, z3.d\n"
+                        "ld1rqw z7.s, p6/z, [a_ptr3]\n"
+                        "trn2 z2.d, z4.d, z5.d\n"
+                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "addvl a_ptr1, a_ptr1, #1\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "ld1rqw z8.s, p6/z, [a_ptr4]\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        "addvl a_ptr2, a_ptr2, #1\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        "addvl a_ptr3, a_ptr3, #1\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        "addvl a_ptr4, a_ptr4, #1\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z9.s, #0\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "trn1 z0.d, z4.d, z5.d\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "trn1 z1.d, z6.d, z7.d\n"
+                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+                        "trn1 z2.d, z8.d, z9.d\n"
+                        "cbz %[blocks], 5f\n"
+                        "trn2 z10.d, z8.d, z9.d\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "trn2 z9.d, z6.d, z7.d\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "trn2 z8.d, z4.d, z5.d\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+                        "b.eq 5f\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+                        "5:\n"
+                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
+                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
+                        "fmax z16.s, p7/m, z16.s, z14.s\n"
+                        "fmax z17.s, p7/m, z17.s, z14.s\n"
+                        "fmax z18.s, p7/m, z18.s, z14.s\n"
+                        "fmax z19.s, p7/m, z19.s, z14.s\n"
+                        "fmin z16.s, p7/m, z16.s, z15.s\n"
+                        "fmin z17.s, p7/m, z17.s, z15.s\n"
+                        "fmin z18.s, p7/m, z18.s, z15.s\n"
+                        "fmin z19.s, p7/m, z19.s, z15.s\n"
+                        "fmax z20.s, p7/m, z20.s, z14.s\n"
+                        "uzp1 z0.s, z16.s, z17.s\n"
+                        "uzp2 z1.s, z16.s, z17.s\n"
+                        "uzp1 z2.s, z18.s, z19.s\n"
+                        "uzp2 z3.s, z18.s, z19.s\n"
+                        "st1w z0.s, p0, [%[c_ptr0]]\n"
+                        "fmin z20.s, p7/m, z20.s, z15.s\n"
+                        "fmax z21.s, p7/m, z21.s, z14.s\n"
+                        "fmax z22.s, p7/m, z22.s, z14.s\n"
+                        "st1w z1.s, p0, [c_ptr1]\n"
+                        "fmax z23.s, p7/m, z23.s, z14.s\n"
+                        "fmax z24.s, p7/m, z24.s, z14.s\n"
+                        "fmin z21.s, p7/m, z21.s, z15.s\n"
+                        "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "fmin z22.s, p7/m, z22.s, z15.s\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #2\n"
+                        "fmin z23.s, p7/m, z23.s, z15.s\n"
+                        "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "uzp1 z4.s, z20.s, z21.s\n"
+                        "uzp2 z5.s, z20.s, z21.s\n"
+                        "fmin z24.s, p7/m, z24.s, z15.s\n"
+                        "uzp1 z6.s, z22.s, z23.s\n"
+                        "st1w z4.s, p0, [c_ptr2]\n"
+                        "uzp2 z7.s, z22.s, z23.s\n"
+                        "fmax z25.s, p7/m, z25.s, z14.s\n"
+                        "fmax z26.s, p7/m, z26.s, z14.s\n"
+                        "st1w z5.s, p0, [c_ptr3]\n"
+                        "fmax z27.s, p7/m, z27.s, z14.s\n"
+                        "fmin z25.s, p7/m, z25.s, z15.s\n"
+                        "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
+                        "fmin z26.s, p7/m, z26.s, z15.s\n"
+                        "fmin z27.s, p7/m, z27.s, z15.s\n"
+                        "uzp1 z8.s, z24.s, z25.s\n"
+                        "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
+                        "uzp1 z9.s, z26.s, z27.s\n"
+                        "st1w z8.s, p0, [c_ptr4]\n"
+                        "st1w z9.s, p1, [c_ptr4, #1, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq a_ptr3\n"
+                        ".unreq a_ptr4\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq c_ptr3\n"
+                        ".unreq c_ptr4\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
+                    );
+                    break;
+                case 6:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "a_ptr3 .req X2\n"
+                        "a_ptr4 .req X3\n"
+                        "a_ptr5 .req X4\n"
+                        "c_ptr1 .req X5\n"
+                        "c_ptr2 .req X6\n"
+                        "c_ptr3 .req X7\n"
+                        "c_ptr4 .req X8\n"
+                        "c_ptr5 .req X9\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
+                        "add a_ptr4, a_ptr3, %[lda]\n"
+                        "add c_ptr4, c_ptr3, %[ldc]\n"
+                        "add a_ptr5, a_ptr4, %[lda]\n"
+                        "add c_ptr5, c_ptr4, %[ldc]\n"
+                        "whilelt p6.s, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.s\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "cbnz %[accumulate], 1f\n"
+                        "ld1w z15.s, p0/z, [%[biasptr]]\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "zip1 z16.s, z15.s, z15.s\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                        "zip2 z17.s, z15.s, z15.s\n"
+                        "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                        "ld1rqw z4.s, p7/z, [a_ptr4]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "zip1 z18.s, z15.s, z15.s\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr5]\n"
+                        "zip2 z19.s, z15.s, z15.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "trn1 z9.d, z2.d, z3.d\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z20.d, z16.d\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "trn1 z10.d, z4.d, z5.d\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z21.d, z17.d\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "mov z22.d, z18.d\n"
+                        "add a_ptr4, a_ptr4, #0x10\n"
+                        "mov z23.d, z19.d\n"
+                        "add a_ptr5, a_ptr5, #0x10\n"
+                        "mov z24.d, z16.d\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                        "mov z25.d, z17.d\n"
+                        "mov z26.d, z18.d\n"
+                        "mov z27.d, z19.d\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z14.s, p0/z, [c_ptr1]\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "zip1 z16.s, z13.s, z14.s\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                        "zip2 z17.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ld1rqw z4.s, p7/z, [a_ptr4]\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "zip1 z18.s, z13.s, z14.s\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr5]\n"
+                        "zip2 z19.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p0/z, [c_ptr2]\n"
+                        "trn1 z9.d, z2.d, z3.d\n"
+                        "ld1w z14.s, p0/z, [c_ptr3]\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "add a_ptr4, a_ptr4, #0x10\n"
+                        "trn1 z10.d, z4.d, z5.d\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "zip1 z20.s, z13.s, z14.s\n"
+                        "add a_ptr5, a_ptr5, #0x10\n"
+                        "zip2 z21.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+                        "zip1 z22.s, z13.s, z14.s\n"
+                        "zip2 z23.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p0/z, [c_ptr4]\n"
+                        "ld1w z14.s, p0/z, [c_ptr5]\n"
+                        "zip1 z24.s, z13.s, z14.s\n"
+                        "zip2 z25.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
+                        "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n"
+                        "zip1 z26.s, z13.s, z14.s\n"
+                        "zip2 z27.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+                        "trn2 z1.d, z2.d, z3.d\n"
+                        "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+                        "trn2 z2.d, z4.d, z5.d\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "ld1rqw z8.s, p7/z, [a_ptr4]\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "ld1rqw z9.s, p7/z, [a_ptr5]\n"
+                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "trn2 z10.d, z8.d, z9.d\n"
+                        "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "add a_ptr4, a_ptr4, #0x20\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        "add a_ptr5, a_ptr5, #0x20\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "trn1 z0.d, z4.d, z5.d\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "trn1 z1.d, z6.d, z7.d\n"
+                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "trn1 z2.d, z8.d, z9.d\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "trn2 z9.d, z6.d, z7.d\n"
+                        "trn2 z8.d, z4.d, z5.d\n"
+                        "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr5, #-0x10]\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "trn1 z9.d, z2.d, z3.d\n"
+                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "trn1 z10.d, z4.d, z5.d\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+                        "trn2 z1.d, z2.d, z3.d\n"
+                        "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+                        "trn2 z2.d, z4.d, z5.d\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "addvl a_ptr3, a_ptr3, #2\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "ld1rqw z8.s, p7/z, [a_ptr4]\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "ld1rqw z9.s, p7/z, [a_ptr5]\n"
+                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "trn2 z10.d, z8.d, z9.d\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "trn1 z0.d, z4.d, z5.d\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "trn1 z1.d, z6.d, z7.d\n"
+                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "trn1 z2.d, z8.d, z9.d\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "trn2 z9.d, z6.d, z7.d\n"
+                        "trn2 z8.d, z4.d, z5.d\n"
+                        "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        "addvl a_ptr4, a_ptr4, #2\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        "addvl a_ptr5, a_ptr5, #2\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #-4\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "addvl a_ptr1, a_ptr1, #2\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "addvl a_ptr2, a_ptr2, #2\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "trn1 z9.d, z2.d, z3.d\n"
+                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+                        "trn1 z10.d, z4.d, z5.d\n"
+                        "cbz %[blocks], 5f\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "trn2 z1.d, z2.d, z3.d\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "trn2 z2.d, z4.d, z5.d\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+                        "b.eq 5f\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+                        "trn2 z1.d, z2.d, z3.d\n"
+                        "ld1rqw z7.s, p6/z, [a_ptr3]\n"
+                        "trn2 z2.d, z4.d, z5.d\n"
+                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "addvl a_ptr1, a_ptr1, #1\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "ld1rqw z8.s, p6/z, [a_ptr4]\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        "addvl a_ptr2, a_ptr2, #1\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        "addvl a_ptr3, a_ptr3, #1\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        "addvl a_ptr4, a_ptr4, #1\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "ld1rqw z9.s, p6/z, [a_ptr5]\n"
+                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        "addvl a_ptr5, a_ptr5, #1\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "trn1 z0.d, z4.d, z5.d\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "trn1 z1.d, z6.d, z7.d\n"
+                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+                        "trn1 z2.d, z8.d, z9.d\n"
+                        "cbz %[blocks], 5f\n"
+                        "trn2 z10.d, z8.d, z9.d\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "trn2 z9.d, z6.d, z7.d\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "trn2 z8.d, z4.d, z5.d\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+                        "b.eq 5f\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+                        "5:\n"
+                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
+                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
+                        "fmax z16.s, p7/m, z16.s, z14.s\n"
+                        "fmax z17.s, p7/m, z17.s, z14.s\n"
+                        "fmax z18.s, p7/m, z18.s, z14.s\n"
+                        "fmax z19.s, p7/m, z19.s, z14.s\n"
+                        "fmin z16.s, p7/m, z16.s, z15.s\n"
+                        "fmin z17.s, p7/m, z17.s, z15.s\n"
+                        "fmin z18.s, p7/m, z18.s, z15.s\n"
+                        "fmin z19.s, p7/m, z19.s, z15.s\n"
+                        "fmax z20.s, p7/m, z20.s, z14.s\n"
+                        "uzp1 z0.s, z16.s, z17.s\n"
+                        "uzp2 z1.s, z16.s, z17.s\n"
+                        "uzp1 z2.s, z18.s, z19.s\n"
+                        "uzp2 z3.s, z18.s, z19.s\n"
+                        "st1w z0.s, p0, [%[c_ptr0]]\n"
+                        "fmin z20.s, p7/m, z20.s, z15.s\n"
+                        "fmax z21.s, p7/m, z21.s, z14.s\n"
+                        "fmax z22.s, p7/m, z22.s, z14.s\n"
+                        "st1w z1.s, p0, [c_ptr1]\n"
+                        "fmax z23.s, p7/m, z23.s, z14.s\n"
+                        "fmax z24.s, p7/m, z24.s, z14.s\n"
+                        "fmin z21.s, p7/m, z21.s, z15.s\n"
+                        "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "fmin z22.s, p7/m, z22.s, z15.s\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #2\n"
+                        "fmin z23.s, p7/m, z23.s, z15.s\n"
+                        "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "uzp1 z4.s, z20.s, z21.s\n"
+                        "uzp2 z5.s, z20.s, z21.s\n"
+                        "fmin z24.s, p7/m, z24.s, z15.s\n"
+                        "uzp1 z6.s, z22.s, z23.s\n"
+                        "st1w z4.s, p0, [c_ptr2]\n"
+                        "uzp2 z7.s, z22.s, z23.s\n"
+                        "fmax z25.s, p7/m, z25.s, z14.s\n"
+                        "fmax z26.s, p7/m, z26.s, z14.s\n"
+                        "st1w z5.s, p0, [c_ptr3]\n"
+                        "fmax z27.s, p7/m, z27.s, z14.s\n"
+                        "fmin z25.s, p7/m, z25.s, z15.s\n"
+                        "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
+                        "fmin z26.s, p7/m, z26.s, z15.s\n"
+                        "fmin z27.s, p7/m, z27.s, z15.s\n"
+                        "uzp1 z8.s, z24.s, z25.s\n"
+                        "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
+                        "uzp2 z9.s, z24.s, z25.s\n"
+                        "uzp1 z10.s, z26.s, z27.s\n"
+                        "uzp2 z11.s, z26.s, z27.s\n"
+                        "st1w z8.s, p0, [c_ptr4]\n"
+                        "st1w z9.s, p0, [c_ptr5]\n"
+                        "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n"
+                        "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq a_ptr3\n"
+                        ".unreq a_ptr4\n"
+                        ".unreq a_ptr5\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq c_ptr3\n"
+                        ".unreq c_ptr4\n"
+                        ".unreq c_ptr5\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
+                    );
+                    break;
+                case 7:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "a_ptr3 .req X2\n"
+                        "a_ptr4 .req X3\n"
+                        "a_ptr5 .req X4\n"
+                        "a_ptr6 .req X5\n"
+                        "c_ptr1 .req X6\n"
+                        "c_ptr2 .req X7\n"
+                        "c_ptr3 .req X8\n"
+                        "c_ptr4 .req X9\n"
+                        "c_ptr5 .req X10\n"
+                        "c_ptr6 .req X11\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
+                        "add a_ptr4, a_ptr3, %[lda]\n"
+                        "add c_ptr4, c_ptr3, %[ldc]\n"
+                        "add a_ptr5, a_ptr4, %[lda]\n"
+                        "add c_ptr5, c_ptr4, %[ldc]\n"
+                        "add a_ptr6, a_ptr5, %[lda]\n"
+                        "add c_ptr6, c_ptr5, %[ldc]\n"
+                        "whilelt p6.s, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.s\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "cbnz %[accumulate], 1f\n"
+                        "mov z7.s, #0\n"
+                        "ld1w z15.s, p0/z, [%[biasptr]]\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "zip1 z16.s, z15.s, z15.s\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                        "zip2 z17.s, z15.s, z15.s\n"
+                        "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                        "ld1rqw z4.s, p7/z, [a_ptr4]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "zip1 z18.s, z15.s, z15.s\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr5]\n"
+                        "zip2 z19.s, z15.s, z15.s\n"
+                        "ld1rqw z6.s, p7/z, [a_ptr6]\n"
+                        "trn1 z9.d, z2.d, z3.d\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "mov z20.d, z16.d\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "trn1 z10.d, z4.d, z5.d\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "trn1 z11.d, z6.d, z7.d\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z21.d, z17.d\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "mov z22.d, z18.d\n"
+                        "add a_ptr4, a_ptr4, #0x10\n"
+                        "mov z23.d, z19.d\n"
+                        "add a_ptr5, a_ptr5, #0x10\n"
+                        "mov z24.d, z16.d\n"
+                        "add a_ptr6, a_ptr6, #0x10\n"
+                        "mov z25.d, z17.d\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                        "mov z26.d, z18.d\n"
+                        "mov z27.d, z19.d\n"
+                        "mov z28.d, z16.d\n"
+                        "mov z29.d, z17.d\n"
+                        "mov z30.d, z18.d\n"
+                        "mov z31.d, z19.d\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "mov z7.s, #0\n"
+                        "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z14.s, p0/z, [c_ptr1]\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "zip1 z16.s, z13.s, z14.s\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                        "zip2 z17.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ld1rqw z4.s, p7/z, [a_ptr4]\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "zip1 z18.s, z13.s, z14.s\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr5]\n"
+                        "zip2 z19.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p0/z, [c_ptr2]\n"
+                        "trn1 z9.d, z2.d, z3.d\n"
+                        "ld1w z14.s, p0/z, [c_ptr3]\n"
+                        "ld1rqw z6.s, p7/z, [a_ptr6]\n"
+                        "add a_ptr4, a_ptr4, #0x10\n"
+                        "trn1 z10.d, z4.d, z5.d\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "zip1 z20.s, z13.s, z14.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "zip2 z21.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "trn1 z11.d, z6.d, z7.d\n"
+                        "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+                        "add a_ptr5, a_ptr5, #0x10\n"
+                        "add a_ptr6, a_ptr6, #0x10\n"
+                        "zip1 z22.s, z13.s, z14.s\n"
+                        "zip2 z23.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p0/z, [c_ptr4]\n"
+                        "ld1w z14.s, p0/z, [c_ptr5]\n"
+                        "zip1 z24.s, z13.s, z14.s\n"
+                        "zip2 z25.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
+                        "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n"
+                        "zip1 z26.s, z13.s, z14.s\n"
+                        "zip2 z27.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p0/z, [c_ptr6]\n"
+                        "mov z14.s, #0\n"
+                        "zip1 z28.s, z13.s, z14.s\n"
+                        "zip2 z29.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p1/z, [c_ptr6, #1, MUL VL]\n"
+                        "mov z14.s, #0\n"
+                        "zip1 z30.s, z13.s, z14.s\n"
+                        "zip2 z31.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "trn2 z1.d, z2.d, z3.d\n"
+                        "trn2 z2.d, z4.d, z5.d\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        "trn2 z3.d, z6.d, z7.d\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "ld1rqw z8.s, p7/z, [a_ptr4]\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "ld1rqw z9.s, p7/z, [a_ptr5]\n"
+                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+                        "add a_ptr4, a_ptr4, #0x20\n"
+                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+                        "add a_ptr5, a_ptr5, #0x20\n"
+                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+                        "ld1rqw z10.s, p7/z, [a_ptr6]\n"
+                        ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z11.s, #0\n"
+                        "add a_ptr6, a_ptr6, #0x20\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "trn1 z0.d, z4.d, z5.d\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "trn1 z1.d, z6.d, z7.d\n"
+                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+                        "trn1 z2.d, z8.d, z9.d\n"
+                        ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "trn1 z3.d, z10.d, z11.d\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "trn2 z11.d, z10.d, z11.d\n"
+                        "trn2 z10.d, z8.d, z9.d\n"
+                        "trn2 z9.d, z6.d, z7.d\n"
+                        "ld1rqw z6.s, p7/z, [a_ptr6, #-0x10]\n"
+                        "trn2 z8.d, z4.d, z5.d\n"
+                        "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr5, #-0x10]\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
+                        ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "mov z7.s, #0\n"
+                        "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "trn1 z9.d, z2.d, z3.d\n"
+                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+                        "trn1 z10.d, z4.d, z5.d\n"
+                        ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "trn1 z11.d, z6.d, z7.d\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "trn2 z1.d, z2.d, z3.d\n"
+                        "trn2 z2.d, z4.d, z5.d\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        "trn2 z3.d, z6.d, z7.d\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "ld1rqw z8.s, p7/z, [a_ptr4]\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "ld1rqw z9.s, p7/z, [a_ptr5]\n"
+                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+                        "ld1rqw z10.s, p7/z, [a_ptr6]\n"
+                        ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z11.s, #0\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "trn1 z0.d, z4.d, z5.d\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "trn1 z1.d, z6.d, z7.d\n"
+                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+                        "trn1 z2.d, z8.d, z9.d\n"
+                        ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "trn1 z3.d, z10.d, z11.d\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "trn2 z11.d, z10.d, z11.d\n"
+                        "trn2 z10.d, z8.d, z9.d\n"
+                        "trn2 z9.d, z6.d, z7.d\n"
+                        "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n"
+                        "trn2 z8.d, z4.d, z5.d\n"
+                        "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        "addvl a_ptr4, a_ptr4, #2\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        "addvl a_ptr5, a_ptr5, #2\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        "addvl a_ptr6, a_ptr6, #2\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+                        "addvl a_ptr1, a_ptr1, #2\n"
+                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+                        "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+                        ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        "mov z7.s, #0\n"
+                        "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #-4\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "addvl a_ptr2, a_ptr2, #2\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "addvl a_ptr3, a_ptr3, #2\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "trn1 z9.d, z2.d, z3.d\n"
+                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+                        "trn1 z10.d, z4.d, z5.d\n"
+                        ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
+                        ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
+                        ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
+                        ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
+                        "trn1 z11.d, z6.d, z7.d\n"
+                        "cbz %[blocks], 5f\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "trn2 z1.d, z2.d, z3.d\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "trn2 z2.d, z4.d, z5.d\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "trn2 z3.d, z6.d, z7.d\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+                        ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
+                        ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
+                        ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
+                        ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
+                        "b.eq 5f\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+                        ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
+                        ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
+                        ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
+                        ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "trn2 z1.d, z2.d, z3.d\n"
+                        "trn2 z2.d, z4.d, z5.d\n"
+                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+                        "trn2 z3.d, z6.d, z7.d\n"
+                        "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "ld1rqw z7.s, p6/z, [a_ptr3]\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "ld1rqw z8.s, p6/z, [a_ptr4]\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        "addvl a_ptr1, a_ptr1, #1\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        "addvl a_ptr2, a_ptr2, #1\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        "addvl a_ptr3, a_ptr3, #1\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "ld1rqw z9.s, p6/z, [a_ptr5]\n"
+                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+                        "addvl a_ptr4, a_ptr4, #1\n"
+                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+                        "addvl a_ptr5, a_ptr5, #1\n"
+                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+                        "ld1rqw z10.s, p6/z, [a_ptr6]\n"
+                        ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z11.s, #0\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "addvl a_ptr6, a_ptr6, #1\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "trn1 z0.d, z4.d, z5.d\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "trn1 z1.d, z6.d, z7.d\n"
+                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+                        "trn1 z2.d, z8.d, z9.d\n"
+                        ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
+                        ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
+                        ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
+                        ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
+                        "trn1 z3.d, z10.d, z11.d\n"
+                        "cbz %[blocks], 5f\n"
+                        "trn2 z11.d, z10.d, z11.d\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "trn2 z10.d, z8.d, z9.d\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "trn2 z9.d, z6.d, z7.d\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "trn2 z8.d, z4.d, z5.d\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+                        ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
+                        ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
+                        ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
+                        ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
+                        "b.eq 5f\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+                        ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
+                        ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
+                        ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
+                        ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
+                        "5:\n"
+                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
+                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
+                        "fmax z16.s, p7/m, z16.s, z14.s\n"
+                        "fmax z17.s, p7/m, z17.s, z14.s\n"
+                        "fmax z18.s, p7/m, z18.s, z14.s\n"
+                        "fmax z19.s, p7/m, z19.s, z14.s\n"
+                        "fmin z16.s, p7/m, z16.s, z15.s\n"
+                        "fmin z17.s, p7/m, z17.s, z15.s\n"
+                        "fmin z18.s, p7/m, z18.s, z15.s\n"
+                        "fmin z19.s, p7/m, z19.s, z15.s\n"
+                        "fmax z20.s, p7/m, z20.s, z14.s\n"
+                        "uzp1 z0.s, z16.s, z17.s\n"
+                        "uzp2 z1.s, z16.s, z17.s\n"
+                        "uzp1 z2.s, z18.s, z19.s\n"
+                        "uzp2 z3.s, z18.s, z19.s\n"
+                        "st1w z0.s, p0, [%[c_ptr0]]\n"
+                        "fmin z20.s, p7/m, z20.s, z15.s\n"
+                        "fmax z21.s, p7/m, z21.s, z14.s\n"
+                        "fmax z22.s, p7/m, z22.s, z14.s\n"
+                        "st1w z1.s, p0, [c_ptr1]\n"
+                        "fmax z23.s, p7/m, z23.s, z14.s\n"
+                        "fmax z24.s, p7/m, z24.s, z14.s\n"
+                        "fmin z21.s, p7/m, z21.s, z15.s\n"
+                        "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "fmin z22.s, p7/m, z22.s, z15.s\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #2\n"
+                        "fmin z23.s, p7/m, z23.s, z15.s\n"
+                        "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "uzp1 z4.s, z20.s, z21.s\n"
+                        "uzp2 z5.s, z20.s, z21.s\n"
+                        "fmin z24.s, p7/m, z24.s, z15.s\n"
+                        "uzp1 z6.s, z22.s, z23.s\n"
+                        "st1w z4.s, p0, [c_ptr2]\n"
+                        "uzp2 z7.s, z22.s, z23.s\n"
+                        "fmax z25.s, p7/m, z25.s, z14.s\n"
+                        "fmax z26.s, p7/m, z26.s, z14.s\n"
+                        "st1w z5.s, p0, [c_ptr3]\n"
+                        "fmax z27.s, p7/m, z27.s, z14.s\n"
+                        "fmax z28.s, p7/m, z28.s, z14.s\n"
+                        "fmin z25.s, p7/m, z25.s, z15.s\n"
+                        "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
+                        "fmin z26.s, p7/m, z26.s, z15.s\n"
+                        "fmin z27.s, p7/m, z27.s, z15.s\n"
+                        "fmin z28.s, p7/m, z28.s, z15.s\n"
+                        "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
+                        "uzp1 z8.s, z24.s, z25.s\n"
+                        "uzp2 z9.s, z24.s, z25.s\n"
+                        "uzp1 z10.s, z26.s, z27.s\n"
+                        "uzp2 z11.s, z26.s, z27.s\n"
+                        "st1w z8.s, p0, [c_ptr4]\n"
+                        "fmax z29.s, p7/m, z29.s, z14.s\n"
+                        "fmax z30.s, p7/m, z30.s, z14.s\n"
+                        "fmax z31.s, p7/m, z31.s, z14.s\n"
+                        "st1w z9.s, p0, [c_ptr5]\n"
+                        "fmin z29.s, p7/m, z29.s, z15.s\n"
+                        "fmin z30.s, p7/m, z30.s, z15.s\n"
+                        "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n"
+                        "fmin z31.s, p7/m, z31.s, z15.s\n"
+                        "uzp1 z12.s, z28.s, z29.s\n"
+                        "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n"
+                        "uzp1 z13.s, z30.s, z31.s\n"
+                        "st1w z12.s, p0, [c_ptr6]\n"
+                        "st1w z13.s, p1, [c_ptr6, #1, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq a_ptr3\n"
+                        ".unreq a_ptr4\n"
+                        ".unreq a_ptr5\n"
+                        ".unreq a_ptr6\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq c_ptr3\n"
+                        ".unreq c_ptr4\n"
+                        ".unreq c_ptr5\n"
+                        ".unreq c_ptr6\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "cc", "memory"
+                    );
+                    break;
+                default:
+                case 8:
+                    __asm __volatile (
+                        "a_ptr1 .req X0\n"
+                        "a_ptr2 .req X1\n"
+                        "a_ptr3 .req X2\n"
+                        "a_ptr4 .req X3\n"
+                        "a_ptr5 .req X4\n"
+                        "a_ptr6 .req X5\n"
+                        "a_ptr7 .req X6\n"
+                        "c_ptr1 .req X7\n"
+                        "c_ptr2 .req X8\n"
+                        "c_ptr3 .req X9\n"
+                        "c_ptr4 .req X10\n"
+                        "c_ptr5 .req X11\n"
+                        "c_ptr6 .req X12\n"
+                        "c_ptr7 .req X13\n"
+                        "add a_ptr1, %[a_ptr0], %[lda]\n"
+                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
+                        "add a_ptr2, a_ptr1, %[lda]\n"
+                        "add c_ptr2, c_ptr1, %[ldc]\n"
+                        "add a_ptr3, a_ptr2, %[lda]\n"
+                        "add c_ptr3, c_ptr2, %[ldc]\n"
+                        "add a_ptr4, a_ptr3, %[lda]\n"
+                        "add c_ptr4, c_ptr3, %[ldc]\n"
+                        "add a_ptr5, a_ptr4, %[lda]\n"
+                        "add c_ptr5, c_ptr4, %[ldc]\n"
+                        "add a_ptr6, a_ptr5, %[lda]\n"
+                        "add c_ptr6, c_ptr5, %[ldc]\n"
+                        "add a_ptr7, a_ptr6, %[lda]\n"
+                        "add c_ptr7, c_ptr6, %[ldc]\n"
+                        "whilelt p6.s, %[temp], %[leftovers]\n"
+                        "whilelt p0.s, %[temp], %[width]\n"
+                        "incw %[temp], all, mul #1\n"
+                        "ptrue p7.s\n"
+                        "whilelt p1.s, %[temp], %[width]\n"
+                        "cbnz %[accumulate], 1f\n"
+                        "ld1w z15.s, p0/z, [%[biasptr]]\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "zip1 z16.s, z15.s, z15.s\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                        "zip2 z17.s, z15.s, z15.s\n"
+                        "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                        "ld1rqw z4.s, p7/z, [a_ptr4]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "zip1 z18.s, z15.s, z15.s\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr5]\n"
+                        "zip2 z19.s, z15.s, z15.s\n"
+                        "ld1rqw z6.s, p7/z, [a_ptr6]\n"
+                        "trn1 z9.d, z2.d, z3.d\n"
+                        "ld1rqw z7.s, p7/z, [a_ptr7]\n"
+                        "mov z20.d, z16.d\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "trn1 z10.d, z4.d, z5.d\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "mov z21.d, z17.d\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "trn1 z11.d, z6.d, z7.d\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "mov z22.d, z18.d\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "mov z23.d, z19.d\n"
+                        "add a_ptr4, a_ptr4, #0x10\n"
+                        "mov z24.d, z16.d\n"
+                        "add a_ptr5, a_ptr5, #0x10\n"
+                        "mov z25.d, z17.d\n"
+                        "add a_ptr6, a_ptr6, #0x10\n"
+                        "mov z26.d, z18.d\n"
+                        "add a_ptr7, a_ptr7, #0x10\n"
+                        "mov z27.d, z19.d\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                        "mov z28.d, z16.d\n"
+                        "mov z29.d, z17.d\n"
+                        "mov z30.d, z18.d\n"
+                        "mov z31.d, z19.d\n"
+                        "cbz %[loops], 2f\n"
+                        "b 3f\n"
+                        "1:\n"
+                        "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
+                        "ld1w z14.s, p0/z, [c_ptr1]\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
+                        "add a_ptr1, a_ptr1, #0x10\n"
+                        "zip1 z16.s, z13.s, z14.s\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
+                        "zip2 z17.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
+                        "ld1rqw z3.s, p7/z, [a_ptr3]\n"
+                        "add a_ptr2, a_ptr2, #0x10\n"
+                        "ld1rqw z4.s, p7/z, [a_ptr4]\n"
+                        "add a_ptr3, a_ptr3, #0x10\n"
+                        "zip1 z18.s, z13.s, z14.s\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr5]\n"
+                        "zip2 z19.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p0/z, [c_ptr2]\n"
+                        "trn1 z9.d, z2.d, z3.d\n"
+                        "ld1w z14.s, p0/z, [c_ptr3]\n"
+                        "ld1rqw z6.s, p7/z, [a_ptr6]\n"
+                        "add a_ptr4, a_ptr4, #0x10\n"
+                        "trn1 z10.d, z4.d, z5.d\n"
+                        "ld1rqw z7.s, p7/z, [a_ptr7]\n"
+                        "zip1 z20.s, z13.s, z14.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "zip2 z21.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
+                        "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
+                        "add a_ptr5, a_ptr5, #0x10\n"
+                        "trn1 z11.d, z6.d, z7.d\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        "add a_ptr6, a_ptr6, #0x10\n"
+                        "zip1 z22.s, z13.s, z14.s\n"
+                        "add a_ptr7, a_ptr7, #0x10\n"
+                        "zip2 z23.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p0/z, [c_ptr4]\n"
+                        "ld1w z14.s, p0/z, [c_ptr5]\n"
+                        "zip1 z24.s, z13.s, z14.s\n"
+                        "zip2 z25.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
+                        "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n"
+                        "zip1 z26.s, z13.s, z14.s\n"
+                        "zip2 z27.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p0/z, [c_ptr6]\n"
+                        "ld1w z14.s, p0/z, [c_ptr7]\n"
+                        "zip1 z28.s, z13.s, z14.s\n"
+                        "zip2 z29.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p1/z, [c_ptr6, #1, MUL VL]\n"
+                        "ld1w z14.s, p1/z, [c_ptr7, #1, MUL VL]\n"
+                        "zip1 z30.s, z13.s, z14.s\n"
+                        "zip2 z31.s, z13.s, z14.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                        "cbz %[loops], 2f\n"
+                        "3:\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "subs %[loops], %[loops], #0x1\n"
+                        "trn2 z1.d, z2.d, z3.d\n"
+                        "trn2 z2.d, z4.d, z5.d\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        "trn2 z3.d, z6.d, z7.d\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "ld1rqw z8.s, p7/z, [a_ptr4]\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        "add a_ptr1, a_ptr1, #0x20\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        "add a_ptr2, a_ptr2, #0x20\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        "add a_ptr3, a_ptr3, #0x20\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "ld1rqw z9.s, p7/z, [a_ptr5]\n"
+                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+                        "add a_ptr4, a_ptr4, #0x20\n"
+                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+                        "add a_ptr5, a_ptr5, #0x20\n"
+                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+                        "ld1rqw z10.s, p7/z, [a_ptr6]\n"
+                        ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "ld1rqw z11.s, p7/z, [a_ptr7]\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        "add a_ptr6, a_ptr6, #0x20\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        "add a_ptr7, a_ptr7, #0x20\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "trn1 z0.d, z4.d, z5.d\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "trn1 z1.d, z6.d, z7.d\n"
+                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+                        "trn1 z2.d, z8.d, z9.d\n"
+                        ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "trn1 z3.d, z10.d, z11.d\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "trn2 z11.d, z10.d, z11.d\n"
+                        "trn2 z10.d, z8.d, z9.d\n"
+                        "trn2 z9.d, z6.d, z7.d\n"
+                        "ld1rqw z6.s, p7/z, [a_ptr6, #-0x10]\n"
+                        "trn2 z8.d, z4.d, z5.d\n"
+                        "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr5, #-0x10]\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        "ld1rqw z7.s, p7/z, [a_ptr7, #-0x10]\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
+                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+                        "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
+                        ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "trn1 z9.d, z2.d, z3.d\n"
+                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+                        "trn1 z10.d, z4.d, z5.d\n"
+                        ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
+                        ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
+                        ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
+                        ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
+                        "trn1 z11.d, z6.d, z7.d\n"
+                        "b.ne 3b\n"
+                        "2:\n"
+                        "cbz %[regs], 4f\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "trn2 z1.d, z2.d, z3.d\n"
+                        "trn2 z2.d, z4.d, z5.d\n"
+                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
+                        "trn2 z3.d, z6.d, z7.d\n"
+                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "ld1rqw z7.s, p7/z, [a_ptr3]\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "ld1rqw z8.s, p7/z, [a_ptr4]\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "ld1rqw z9.s, p7/z, [a_ptr5]\n"
+                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+                        "ld1rqw z10.s, p7/z, [a_ptr6]\n"
+                        ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "ld1rqw z11.s, p7/z, [a_ptr7]\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "trn1 z0.d, z4.d, z5.d\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "trn1 z1.d, z6.d, z7.d\n"
+                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+                        "trn1 z2.d, z8.d, z9.d\n"
+                        ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        "trn1 z3.d, z10.d, z11.d\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #16\n"
+                        "trn2 z11.d, z10.d, z11.d\n"
+                        "trn2 z10.d, z8.d, z9.d\n"
+                        "trn2 z9.d, z6.d, z7.d\n"
+                        "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n"
+                        "trn2 z8.d, z4.d, z5.d\n"
+                        "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        "addvl a_ptr4, a_ptr4, #2\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        "addvl a_ptr5, a_ptr5, #2\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        "addvl a_ptr6, a_ptr6, #2\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
+                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+                        "addvl a_ptr1, a_ptr1, #2\n"
+                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+                        "addvl a_ptr7, a_ptr7, #2\n"
+                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+                        "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
+                        ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
+                        ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
+                        ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
+                        ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #-4\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "addvl a_ptr2, a_ptr2, #2\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "addvl a_ptr3, a_ptr3, #2\n"
+                        "trn1 z8.d, z0.d, z1.d\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "trn1 z9.d, z2.d, z3.d\n"
+                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+                        "trn1 z10.d, z4.d, z5.d\n"
+                        ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
+                        ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
+                        ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
+                        ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
+                        "trn1 z11.d, z6.d, z7.d\n"
+                        "cbz %[blocks], 5f\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "trn2 z1.d, z2.d, z3.d\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "trn2 z2.d, z4.d, z5.d\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "trn2 z3.d, z6.d, z7.d\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+                        ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
+                        ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
+                        ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
+                        ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
+                        "b.eq 5f\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+                        ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
+                        ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
+                        ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
+                        ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
+                        "b 5f\n"
+                        "4:\n"
+                        "trn2 z0.d, z0.d, z1.d\n"
+                        "trn2 z1.d, z2.d, z3.d\n"
+                        "trn2 z2.d, z4.d, z5.d\n"
+                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
+                        "trn2 z3.d, z6.d, z7.d\n"
+                        "ld1rqw z5.s, p6/z, [a_ptr1]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        "ld1rqw z6.s, p6/z, [a_ptr2]\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        "ld1rqw z7.s, p6/z, [a_ptr3]\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        "ld1rqw z8.s, p6/z, [a_ptr4]\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        "addvl a_ptr1, a_ptr1, #1\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        "addvl a_ptr2, a_ptr2, #1\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        "addvl a_ptr3, a_ptr3, #1\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        "ld1rqw z9.s, p6/z, [a_ptr5]\n"
+                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+                        "addvl a_ptr4, a_ptr4, #1\n"
+                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+                        "addvl a_ptr5, a_ptr5, #1\n"
+                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+                        "ld1rqw z10.s, p6/z, [a_ptr6]\n"
+                        ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "ld1rqw z11.s, p6/z, [a_ptr7]\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        "addvl %[b_ptr0], %[b_ptr0], #4\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        "addvl a_ptr6, a_ptr6, #1\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        "addvl a_ptr7, a_ptr7, #1\n"
+                        "trn1 z0.d, z4.d, z5.d\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        "trn1 z1.d, z6.d, z7.d\n"
+                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+                        "trn1 z2.d, z8.d, z9.d\n"
+                        ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
+                        ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
+                        ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
+                        ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
+                        "trn1 z3.d, z10.d, z11.d\n"
+                        "cbz %[blocks], 5f\n"
+                        "trn2 z11.d, z10.d, z11.d\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
+                        "trn2 z10.d, z8.d, z9.d\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+                        "trn2 z9.d, z6.d, z7.d\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+                        "trn2 z8.d, z4.d, z5.d\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+                        ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
+                        "subs %[blocks], %[blocks], #0x1\n"
+                        ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
+                        ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
+                        ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
+                        ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
+                        ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
+                        ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
+                        ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
+                        ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
+                        ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
+                        ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
+                        ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
+                        ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
+                        ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
+                        ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
+                        ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
+                        "b.eq 5f\n"
+                        "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+                        "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+                        "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+                        "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+                        ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
+                        ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
+                        ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
+                        ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
+                        ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
+                        ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
+                        ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
+                        ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
+                        ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
+                        ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
+                        ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
+                        ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
+                        ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
+                        ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
+                        ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
+                        ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
+                        "5:\n"
+                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
+                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
+                        "fmax z16.s, p7/m, z16.s, z14.s\n"
+                        "fmax z17.s, p7/m, z17.s, z14.s\n"
+                        "fmax z18.s, p7/m, z18.s, z14.s\n"
+                        "fmax z19.s, p7/m, z19.s, z14.s\n"
+                        "fmin z16.s, p7/m, z16.s, z15.s\n"
+                        "fmin z17.s, p7/m, z17.s, z15.s\n"
+                        "fmin z18.s, p7/m, z18.s, z15.s\n"
+                        "fmin z19.s, p7/m, z19.s, z15.s\n"
+                        "fmax z20.s, p7/m, z20.s, z14.s\n"
+                        "uzp1 z0.s, z16.s, z17.s\n"
+                        "uzp2 z1.s, z16.s, z17.s\n"
+                        "uzp1 z2.s, z18.s, z19.s\n"
+                        "uzp2 z3.s, z18.s, z19.s\n"
+                        "st1w z0.s, p0, [%[c_ptr0]]\n"
+                        "fmin z20.s, p7/m, z20.s, z15.s\n"
+                        "fmax z21.s, p7/m, z21.s, z14.s\n"
+                        "fmax z22.s, p7/m, z22.s, z14.s\n"
+                        "st1w z1.s, p0, [c_ptr1]\n"
+                        "fmax z23.s, p7/m, z23.s, z14.s\n"
+                        "fmax z24.s, p7/m, z24.s, z14.s\n"
+                        "fmin z21.s, p7/m, z21.s, z15.s\n"
+                        "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
+                        "fmin z22.s, p7/m, z22.s, z15.s\n"
+                        "addvl %[c_ptr0], %[c_ptr0], #2\n"
+                        "fmin z23.s, p7/m, z23.s, z15.s\n"
+                        "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
+                        "uzp1 z4.s, z20.s, z21.s\n"
+                        "uzp2 z5.s, z20.s, z21.s\n"
+                        "fmin z24.s, p7/m, z24.s, z15.s\n"
+                        "uzp1 z6.s, z22.s, z23.s\n"
+                        "st1w z4.s, p0, [c_ptr2]\n"
+                        "uzp2 z7.s, z22.s, z23.s\n"
+                        "fmax z25.s, p7/m, z25.s, z14.s\n"
+                        "fmax z26.s, p7/m, z26.s, z14.s\n"
+                        "st1w z5.s, p0, [c_ptr3]\n"
+                        "fmax z27.s, p7/m, z27.s, z14.s\n"
+                        "fmax z28.s, p7/m, z28.s, z14.s\n"
+                        "fmin z25.s, p7/m, z25.s, z15.s\n"
+                        "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
+                        "fmin z26.s, p7/m, z26.s, z15.s\n"
+                        "fmin z27.s, p7/m, z27.s, z15.s\n"
+                        "fmin z28.s, p7/m, z28.s, z15.s\n"
+                        "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
+                        "uzp1 z8.s, z24.s, z25.s\n"
+                        "uzp2 z9.s, z24.s, z25.s\n"
+                        "uzp1 z10.s, z26.s, z27.s\n"
+                        "uzp2 z11.s, z26.s, z27.s\n"
+                        "st1w z8.s, p0, [c_ptr4]\n"
+                        "fmax z29.s, p7/m, z29.s, z14.s\n"
+                        "fmax z30.s, p7/m, z30.s, z14.s\n"
+                        "fmax z31.s, p7/m, z31.s, z14.s\n"
+                        "st1w z9.s, p0, [c_ptr5]\n"
+                        "fmin z29.s, p7/m, z29.s, z15.s\n"
+                        "fmin z30.s, p7/m, z30.s, z15.s\n"
+                        "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n"
+                        "fmin z31.s, p7/m, z31.s, z15.s\n"
+                        "uzp1 z12.s, z28.s, z29.s\n"
+                        "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n"
+                        "uzp2 z13.s, z28.s, z29.s\n"
+                        "uzp1 z14.s, z30.s, z31.s\n"
+                        "uzp2 z15.s, z30.s, z31.s\n"
+                        "st1w z12.s, p0, [c_ptr6]\n"
+                        "st1w z13.s, p0, [c_ptr7]\n"
+                        "st1w z14.s, p1, [c_ptr6, #1, MUL VL]\n"
+                        "st1w z15.s, p1, [c_ptr7, #1, MUL VL]\n"
+                        ".unreq a_ptr1\n"
+                        ".unreq a_ptr2\n"
+                        ".unreq a_ptr3\n"
+                        ".unreq a_ptr4\n"
+                        ".unreq a_ptr5\n"
+                        ".unreq a_ptr6\n"
+                        ".unreq a_ptr7\n"
+                        ".unreq c_ptr1\n"
+                        ".unreq c_ptr2\n"
+                        ".unreq c_ptr3\n"
+                        ".unreq c_ptr4\n"
+                        ".unreq c_ptr5\n"
+                        ".unreq c_ptr6\n"
+                        ".unreq c_ptr7\n"
+                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
+                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc", "memory"
+                    );
+                    break;
+            }
+
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp
index d842210..c500f43 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,7 +58,7 @@
         return 4;
     }
 
-    static constexpr bool supports_append()
+    static constexpr bool supports_accumulate()
     {
         return true;
     }
@@ -78,7 +78,10 @@
     // Default to the generic kernel
     kern_type kernel=sve_hybrid_s8s32_dot_4VLx4;
 
-    hybrid_s8s32_dot_4VLx4(const CPUInfo *ci) { UNUSED(ci); }
+    hybrid_s8s32_dot_4VLx4(const CPUInfo *)
+    {
+
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp
index d57557a..b30b884 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 
 namespace arm_gemm {
 
-void sve_hybrid_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *bias, Activation act, bool append) {
+void sve_hybrid_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation , bool accumulate) {
     const int K_stride = ((K + 3) / 4) * 4;
     const long loops_count = ((K + 16) / 32) - 1;
     K -= loops_count * 32;
@@ -41,12 +41,23 @@
     const long leftovers = K;
     const long blocks_count = (K + 3) / 4;
 
-    for (int y=0; y<M; y+=4) {
+    int rows_to_compute;
+
+    for (int y=0; y<M; y+=rows_to_compute) {
         const int8_t * const a_ptr0_base = A + (y * lda);
         const unsigned long ldab = lda * sizeof(int8_t);
 
         int32_t *c_ptr0 = C + (y * ldc);
 
+        rows_to_compute = M-y;
+        if (rows_to_compute > 4) {
+            if (rows_to_compute % 4) {
+                rows_to_compute = 4 - 1;
+            } else {
+                rows_to_compute = 4;
+            }
+        }
+
         for (int x0=0; x0<N; x0+=(4 * get_vector_length<int32_t>())) {
             const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<int32_t>()));
             long loops = loops_count;
@@ -57,7 +68,7 @@
             const int8_t *b_ptr0 = B + (K_stride * x0);
             const unsigned long ldcb = ldc * sizeof(int32_t);
 
-            switch(M-y) {
+            switch(rows_to_compute) {
                 case 1:
                     __asm __volatile (
                         "whilelt p6.b, %[temp], %[leftovers]\n"
@@ -69,7 +80,7 @@
                         "whilelt p2.s, %[temp], %[width]\n"
                         "incw %[temp], all, mul #1\n"
                         "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "mov z16.s, #0\n"
                         "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                         "mov z17.s, #0\n"
@@ -355,7 +366,7 @@
                         "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
                         "addvl %[c_ptr0], %[c_ptr0], #4\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
                     );
                     break;
@@ -374,7 +385,7 @@
                         "whilelt p2.s, %[temp], %[width]\n"
                         "incw %[temp], all, mul #1\n"
                         "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "mov z16.s, #0\n"
                         "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                         "mov z17.s, #0\n"
@@ -798,7 +809,7 @@
                         ".unreq a_ptr1\n"
                         ".unreq c_ptr1\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
                     );
                     break;
@@ -821,7 +832,7 @@
                         "whilelt p2.s, %[temp], %[width]\n"
                         "incw %[temp], all, mul #1\n"
                         "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "mov z16.s, #0\n"
                         "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                         "mov z17.s, #0\n"
@@ -1383,7 +1394,7 @@
                         ".unreq c_ptr1\n"
                         ".unreq c_ptr2\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
                     );
                     break;
@@ -1411,7 +1422,7 @@
                         "whilelt p2.s, %[temp], %[width]\n"
                         "incw %[temp], all, mul #1\n"
                         "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "mov z16.s, #0\n"
                         "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                         "mov z17.s, #0\n"
@@ -2111,7 +2122,7 @@
                         ".unreq c_ptr2\n"
                         ".unreq c_ptr3\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                     );
                     break;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp
index 5dab1da..c325e52 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,7 +58,7 @@
         return 4;
     }
 
-    static constexpr bool supports_append()
+    static constexpr bool supports_accumulate()
     {
         return true;
     }
@@ -78,7 +78,10 @@
     // Default to the generic kernel
     kern_type kernel=sve_hybrid_u8u32_dot_4VLx4;
 
-    hybrid_u8u32_dot_4VLx4(const CPUInfo *ci) { UNUSED(ci); }
+    hybrid_u8u32_dot_4VLx4(const CPUInfo *)
+    {
+
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp
index bf3e8ca..565832e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,7 @@
 
 namespace arm_gemm {
 
-void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *bias, Activation act, bool append) {
+void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool accumulate) {
     const int K_stride = ((K + 3) / 4) * 4;
     const long loops_count = ((K + 16) / 32) - 1;
     K -= loops_count * 32;
@@ -41,12 +41,23 @@
     const long leftovers = K;
     const long blocks_count = (K + 3) / 4;
 
-    for (int y=0; y<M; y+=4) {
+    int rows_to_compute;
+
+    for (int y=0; y<M; y+=rows_to_compute) {
         const uint8_t * const a_ptr0_base = A + (y * lda);
         const unsigned long ldab = lda * sizeof(uint8_t);
 
         uint32_t *c_ptr0 = C + (y * ldc);
 
+        rows_to_compute = M-y;
+        if (rows_to_compute > 4) {
+            if (rows_to_compute % 4) {
+                rows_to_compute = 4 - 1;
+            } else {
+                rows_to_compute = 4;
+            }
+        }
+
         for (int x0=0; x0<N; x0+=(4 * get_vector_length<uint32_t>())) {
             const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<uint32_t>()));
             long loops = loops_count;
@@ -57,7 +68,7 @@
             const uint8_t *b_ptr0 = B + (K_stride * x0);
             const unsigned long ldcb = ldc * sizeof(uint32_t);
 
-            switch(M-y) {
+            switch(rows_to_compute) {
                 case 1:
                     __asm __volatile (
                         "whilelt p6.b, %[temp], %[leftovers]\n"
@@ -69,7 +80,7 @@
                         "whilelt p2.s, %[temp], %[width]\n"
                         "incw %[temp], all, mul #1\n"
                         "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "mov z16.s, #0\n"
                         "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                         "mov z17.s, #0\n"
@@ -355,7 +366,7 @@
                         "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
                         "addvl %[c_ptr0], %[c_ptr0], #4\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
                     );
                     break;
@@ -374,7 +385,7 @@
                         "whilelt p2.s, %[temp], %[width]\n"
                         "incw %[temp], all, mul #1\n"
                         "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "mov z16.s, #0\n"
                         "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                         "mov z17.s, #0\n"
@@ -798,7 +809,7 @@
                         ".unreq a_ptr1\n"
                         ".unreq c_ptr1\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
                     );
                     break;
@@ -821,7 +832,7 @@
                         "whilelt p2.s, %[temp], %[width]\n"
                         "incw %[temp], all, mul #1\n"
                         "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "mov z16.s, #0\n"
                         "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                         "mov z17.s, #0\n"
@@ -1383,7 +1394,7 @@
                         ".unreq c_ptr1\n"
                         ".unreq c_ptr2\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
                     );
                     break;
@@ -1411,7 +1422,7 @@
                         "whilelt p2.s, %[temp], %[width]\n"
                         "incw %[temp], all, mul #1\n"
                         "whilelt p3.s, %[temp], %[width]\n"
-                        "cbnz %[append], 1f\n"
+                        "cbnz %[accumulate], 1f\n"
                         "mov z16.s, #0\n"
                         "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
                         "mov z17.s, #0\n"
@@ -2111,7 +2122,7 @@
                         ".unreq c_ptr2\n"
                         ".unreq c_ptr3\n"
                         : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
+                        : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
                         : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
                     );
                     break;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp
index a3434c1..43107e4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,7 +61,10 @@
 
     kern_type kernel=sve_interleaved_bf16fp32_dot_3VLx8;
 
-    interleaved_bf16fp32_dot_3VLx8(const CPUInfo *ci) { UNUSED(ci); }
+    interleaved_bf16fp32_dot_3VLx8(const CPUInfo *)
+    {
+
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp
index 6584158..7e20ed0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,13 +61,11 @@
                 "mov z15.s, #0\n"
                 "ld1rqh z2.h, p0/z, [%[a_ptr], #0x20]\n"
                 "mov z16.s, #0\n"
-                "ld1h z6.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
-                "mov z17.s, #0\n"
-                "ld1rqh z3.h, p0/z, [%[a_ptr], #0x30]\n"
-                "mov z18.s, #0\n"
                 "add %[a_ptr], %[a_ptr], #0x40\n"
-                "mov z19.s, #0\n"
+                "mov z17.s, #0\n"
                 "addvl %[b_ptr], %[b_ptr], #3\n"
+                "mov z18.s, #0\n"
+                "mov z19.s, #0\n"
                 "mov z20.s, #0\n"
                 "mov z21.s, #0\n"
                 "mov z22.s, #0\n"
@@ -83,9 +81,11 @@
                 "cbz %[loops], 1f\n"
                 "2:\n"
                 ".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n"
-                "subs %[loops], %[loops], #0x1\n"
+                "ld1h z6.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
                 ".inst 0x64684089 // bfdot z9.s, z4.h, z0.h[1]\n"
+                "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
                 ".inst 0x6470408a // bfdot z10.s, z4.h, z0.h[2]\n"
+                "subs %[loops], %[loops], #0x1\n"
                 ".inst 0x6478408b // bfdot z11.s, z4.h, z0.h[3]\n"
                 ".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n"
                 ".inst 0x64694095 // bfdot z21.s, z4.h, z1.h[1]\n"
@@ -141,13 +141,13 @@
                 ".inst 0x646b40dd // bfdot z29.s, z6.h, z3.h[1]\n"
                 ".inst 0x647340de // bfdot z30.s, z6.h, z3.h[2]\n"
                 ".inst 0x647b40df // bfdot z31.s, z6.h, z3.h[3]\n"
-                "ld1h z6.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
-                "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
                 "b.ne 2b\n"
                 "1:\n"
                 "cbz %[tails], 3f\n"
                 ".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n"
+                "ld1h z6.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
                 ".inst 0x64684089 // bfdot z9.s, z4.h, z0.h[1]\n"
+                "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
                 ".inst 0x6470408a // bfdot z10.s, z4.h, z0.h[2]\n"
                 ".inst 0x6478408b // bfdot z11.s, z4.h, z0.h[3]\n"
                 ".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n"
@@ -235,9 +235,11 @@
                 "b 4f\n"
                 "3:\n"
                 ".inst 0x64604088 // bfdot z8.s, z4.h, z0.h[0]\n"
-                "addvl %[b_ptr], %[b_ptr], #3\n"
+                "ld1h z6.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
                 ".inst 0x64684089 // bfdot z9.s, z4.h, z0.h[1]\n"
+                "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
                 ".inst 0x6470408a // bfdot z10.s, z4.h, z0.h[2]\n"
+                "addvl %[b_ptr], %[b_ptr], #3\n"
                 ".inst 0x6478408b // bfdot z11.s, z4.h, z0.h[3]\n"
                 ".inst 0x64614094 // bfdot z20.s, z4.h, z1.h[0]\n"
                 ".inst 0x64694095 // bfdot z21.s, z4.h, z1.h[1]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp
index c6ffc04..f1353e2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,7 +61,10 @@
 
     kern_type kernel=sve_interleaved_bf16fp32_mmla_3VLx8;
 
-    interleaved_bf16fp32_mmla_3VLx8(const CPUInfo *ci) { UNUSED(ci); }
+    interleaved_bf16fp32_mmla_3VLx8(const CPUInfo *)
+    {
+
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp
index 528fc72..16cc69b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,13 +63,11 @@
                 "mov z16.s, #0\n"
                 "ld1h z6.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
                 "mov z17.s, #0\n"
-                "ld1rqh z3.h, p0/z, [%[a_ptr], #0x30]\n"
-                "mov z18.s, #0\n"
-                "ld1h z7.h, p0/z, [%[b_ptr], #3, MUL VL]\n"
-                "mov z19.s, #0\n"
                 "add %[a_ptr], %[a_ptr], #0x40\n"
-                "mov z20.s, #0\n"
+                "mov z18.s, #0\n"
                 "addvl %[b_ptr], %[b_ptr], #4\n"
+                "mov z19.s, #0\n"
+                "mov z20.s, #0\n"
                 "mov z21.s, #0\n"
                 "mov z22.s, #0\n"
                 "mov z23.s, #0\n"
@@ -84,12 +82,14 @@
                 "cbz %[loops], 1f\n"
                 "2:\n"
                 ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
-                "subs %[loops], %[loops], #0x1\n"
+                "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
                 ".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n"
+                "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
                 ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
+                "subs %[loops], %[loops], #0x1\n"
+                ".inst 0x6465e409 // bfmmla z9.s, z0.h, z5.h\n"
                 ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n"
                 "ld1h z4.h, p0/z, [%[b_ptr]]\n"
-                ".inst 0x6465e409 // bfmmla z9.s, z0.h, z5.h\n"
                 ".inst 0x6465e42f // bfmmla z15.s, z1.h, z5.h\n"
                 ".inst 0x6465e455 // bfmmla z21.s, z2.h, z5.h\n"
                 ".inst 0x6465e47b // bfmmla z27.s, z3.h, z5.h\n"
@@ -152,18 +152,18 @@
                 ".inst 0x6467e459 // bfmmla z25.s, z2.h, z7.h\n"
                 "ld1rqh z2.h, p0/z, [%[a_ptr], #-0x20]\n"
                 ".inst 0x6467e47f // bfmmla z31.s, z3.h, z7.h\n"
-                "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
-                "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
                 "b.ne 2b\n"
                 "1:\n"
                 "cbz %[tails], 3f\n"
                 ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
+                "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
                 ".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n"
+                "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
                 ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
-                ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n"
-                "ld1h z4.h, p0/z, [%[b_ptr]]\n"
                 ".inst 0x6465e409 // bfmmla z9.s, z0.h, z5.h\n"
                 ".inst 0x6465e42f // bfmmla z15.s, z1.h, z5.h\n"
+                ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n"
+                "ld1h z4.h, p0/z, [%[b_ptr]]\n"
                 ".inst 0x6465e455 // bfmmla z21.s, z2.h, z5.h\n"
                 ".inst 0x6465e47b // bfmmla z27.s, z3.h, z5.h\n"
                 "ld1h z5.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
@@ -269,15 +269,17 @@
                 "b 4f\n"
                 "3:\n"
                 ".inst 0x6464e408 // bfmmla z8.s, z0.h, z4.h\n"
-                "add %[a_ptr], %[a_ptr], #0x40\n"
+                "ld1h z7.h, p0/z, [%[b_ptr], #-1, MUL VL]\n"
                 ".inst 0x6464e42e // bfmmla z14.s, z1.h, z4.h\n"
-                "addvl %[b_ptr], %[b_ptr], #8\n"
+                "ld1rqh z3.h, p0/z, [%[a_ptr], #-0x10]\n"
                 ".inst 0x6464e454 // bfmmla z20.s, z2.h, z4.h\n"
-                ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n"
+                "add %[a_ptr], %[a_ptr], #0x40\n"
                 ".inst 0x6465e409 // bfmmla z9.s, z0.h, z5.h\n"
-                "ld1h z4.h, p0/z, [%[b_ptr], #-8, MUL VL]\n"
+                "addvl %[b_ptr], %[b_ptr], #8\n"
+                ".inst 0x6464e47a // bfmmla z26.s, z3.h, z4.h\n"
                 ".inst 0x6465e42f // bfmmla z15.s, z1.h, z5.h\n"
                 ".inst 0x6465e455 // bfmmla z21.s, z2.h, z5.h\n"
+                "ld1h z4.h, p0/z, [%[b_ptr], #-8, MUL VL]\n"
                 ".inst 0x6465e47b // bfmmla z27.s, z3.h, z5.h\n"
                 "ld1h z5.h, p0/z, [%[b_ptr], #-7, MUL VL]\n"
                 ".inst 0x6466e40a // bfmmla z10.s, z0.h, z6.h\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp
index 10dbdd8..816c0cd 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,7 +61,10 @@
 
     kern_type kernel=sve_interleaved_fp16_mla_3VLx8;
 
-    interleaved_fp16_mla_3VLx8(const CPUInfo *ci) { UNUSED(ci); }
+    interleaved_fp16_mla_3VLx8(const CPUInfo *)
+    {
+
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp
index b2d3a6f..f2050cb 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,22 +50,22 @@
                 "mov z9.h, #0\n"
                 "mov z10.h, #0\n"
                 "mov z11.h, #0\n"
-                "mov z12.h, #0\n"
                 "ld1rqh z0.h, p0/z, [%[a_ptr]]\n"
-                "mov z13.h, #0\n"
+                "mov z12.h, #0\n"
                 "ld1h z2.h, p0/z, [%[b_ptr]]\n"
-                "mov z14.h, #0\n"
+                "mov z13.h, #0\n"
                 "ld1h z3.h, p0/z, [%[b_ptr], #1, MUL VL]\n"
-                "mov z15.h, #0\n"
+                "mov z14.h, #0\n"
                 "ld1h z4.h, p0/z, [%[b_ptr], #2, MUL VL]\n"
-                "mov z16.h, #0\n"
+                "mov z15.h, #0\n"
                 "ld1h z5.h, p0/z, [%[b_ptr], #3, MUL VL]\n"
-                "mov z17.h, #0\n"
+                "mov z16.h, #0\n"
                 "ld1h z6.h, p0/z, [%[b_ptr], #4, MUL VL]\n"
-                "mov z18.h, #0\n"
+                "mov z17.h, #0\n"
                 "add %[a_ptr], %[a_ptr], #0x20\n"
-                "mov z19.h, #0\n"
+                "mov z18.h, #0\n"
                 "addvl %[b_ptr], %[b_ptr], #6\n"
+                "mov z19.h, #0\n"
                 "mov z20.h, #0\n"
                 "mov z21.h, #0\n"
                 "mov z22.h, #0\n"
@@ -202,8 +202,8 @@
                 "fmla z9.h, z2.h, z0.h[1]\n"
                 "fmla z10.h, z2.h, z0.h[2]\n"
                 "fmla z11.h, z2.h, z0.h[3]\n"
-                "fmla z12.h, z2.h, z0.h[4]\n"
                 "st1h z8.h, p0, [%[c_ptr]]\n"
+                "fmla z12.h, z2.h, z0.h[4]\n"
                 "fmla z13.h, z2.h, z0.h[5]\n"
                 "fmla z14.h, z2.h, z0.h[6]\n"
                 "fmla z15.h, z2.h, z0.h[7]\n"
@@ -211,8 +211,8 @@
                 "fmla z17.h, z3.h, z0.h[1]\n"
                 "fmla z18.h, z3.h, z0.h[2]\n"
                 "fmla z19.h, z3.h, z0.h[3]\n"
-                "fmla z20.h, z3.h, z0.h[4]\n"
                 "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n"
+                "fmla z20.h, z3.h, z0.h[4]\n"
                 "fmla z21.h, z3.h, z0.h[5]\n"
                 "fmla z22.h, z3.h, z0.h[6]\n"
                 "fmla z23.h, z3.h, z0.h[7]\n"
@@ -220,10 +220,11 @@
                 "fmla z25.h, z4.h, z0.h[1]\n"
                 "fmla z26.h, z4.h, z0.h[2]\n"
                 "fmla z27.h, z4.h, z0.h[3]\n"
-                "fmla z28.h, z4.h, z0.h[4]\n"
                 "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n"
+                "fmla z28.h, z4.h, z0.h[4]\n"
                 "fmla z29.h, z4.h, z0.h[5]\n"
                 "fmla z30.h, z4.h, z0.h[6]\n"
+                "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n"
                 "fmla z31.h, z4.h, z0.h[7]\n"
                 "b 4f\n"
                 "3:\n"
@@ -257,8 +258,8 @@
                 "fmla z9.h, z5.h, z1.h[1]\n"
                 "fmla z10.h, z5.h, z1.h[2]\n"
                 "fmla z11.h, z5.h, z1.h[3]\n"
-                "fmla z12.h, z5.h, z1.h[4]\n"
                 "st1h z8.h, p0, [%[c_ptr]]\n"
+                "fmla z12.h, z5.h, z1.h[4]\n"
                 "fmla z13.h, z5.h, z1.h[5]\n"
                 "fmla z14.h, z5.h, z1.h[6]\n"
                 "fmla z15.h, z5.h, z1.h[7]\n"
@@ -266,8 +267,8 @@
                 "fmla z17.h, z6.h, z1.h[1]\n"
                 "fmla z18.h, z6.h, z1.h[2]\n"
                 "fmla z19.h, z6.h, z1.h[3]\n"
-                "fmla z20.h, z6.h, z1.h[4]\n"
                 "st1h z16.h, p0, [%[c_ptr], #1, MUL VL]\n"
+                "fmla z20.h, z6.h, z1.h[4]\n"
                 "fmla z21.h, z6.h, z1.h[5]\n"
                 "fmla z22.h, z6.h, z1.h[6]\n"
                 "fmla z23.h, z6.h, z1.h[7]\n"
@@ -275,13 +276,13 @@
                 "fmla z25.h, z7.h, z1.h[1]\n"
                 "fmla z26.h, z7.h, z1.h[2]\n"
                 "fmla z27.h, z7.h, z1.h[3]\n"
-                "fmla z28.h, z7.h, z1.h[4]\n"
                 "st1h z24.h, p0, [%[c_ptr], #2, MUL VL]\n"
+                "fmla z28.h, z7.h, z1.h[4]\n"
                 "fmla z29.h, z7.h, z1.h[5]\n"
                 "fmla z30.h, z7.h, z1.h[6]\n"
+                "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n"
                 "fmla z31.h, z7.h, z1.h[7]\n"
                 "4:\n"
-                "st1h z9.h, p0, [%[c_ptr], #3, MUL VL]\n"
                 "st1h z17.h, p0, [%[c_ptr], #4, MUL VL]\n"
                 "st1h z25.h, p0, [%[c_ptr], #5, MUL VL]\n"
                 "st1h z10.h, p0, [%[c_ptr], #6, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp
index cdc9447..cce90fb 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,7 +61,10 @@
 
     kern_type kernel=sve_interleaved_fp32_mla_3VLx8;
 
-    interleaved_fp32_mla_3VLx8(const CPUInfo *ci) { UNUSED(ci); }
+    interleaved_fp32_mla_3VLx8(const CPUInfo *)
+    {
+
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp
index d26948a..cd178c4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,20 +50,20 @@
                 "mov z9.s, #0\n"
                 "mov z10.s, #0\n"
                 "mov z11.s, #0\n"
-                "mov z12.s, #0\n"
                 "ld1rqw z0.s, p0/z, [%[a_ptr]]\n"
-                "mov z13.s, #0\n"
+                "mov z12.s, #0\n"
                 "ld1w z4.s, p0/z, [%[b_ptr]]\n"
-                "mov z14.s, #0\n"
+                "mov z13.s, #0\n"
                 "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n"
-                "mov z15.s, #0\n"
+                "mov z14.s, #0\n"
                 "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n"
-                "mov z16.s, #0\n"
+                "mov z15.s, #0\n"
                 "ld1rqw z2.s, p0/z, [%[a_ptr], #0x20]\n"
-                "mov z17.s, #0\n"
+                "mov z16.s, #0\n"
                 "add %[a_ptr], %[a_ptr], #0x40\n"
-                "mov z18.s, #0\n"
+                "mov z17.s, #0\n"
                 "addvl %[b_ptr], %[b_ptr], #3\n"
+                "mov z18.s, #0\n"
                 "mov z19.s, #0\n"
                 "mov z20.s, #0\n"
                 "mov z21.s, #0\n"
@@ -207,8 +207,8 @@
                 "fmla z9.s, z4.s, z0.s[1]\n"
                 "fmla z10.s, z4.s, z0.s[2]\n"
                 "fmla z11.s, z4.s, z0.s[3]\n"
-                "fmla z20.s, z4.s, z1.s[0]\n"
                 "st1w z8.s, p0, [%[c_ptr]]\n"
+                "fmla z20.s, z4.s, z1.s[0]\n"
                 "fmla z21.s, z4.s, z1.s[1]\n"
                 "fmla z22.s, z4.s, z1.s[2]\n"
                 "fmla z23.s, z4.s, z1.s[3]\n"
@@ -216,8 +216,8 @@
                 "fmla z13.s, z5.s, z0.s[1]\n"
                 "fmla z14.s, z5.s, z0.s[2]\n"
                 "fmla z15.s, z5.s, z0.s[3]\n"
-                "fmla z24.s, z5.s, z1.s[0]\n"
                 "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
+                "fmla z24.s, z5.s, z1.s[0]\n"
                 "fmla z25.s, z5.s, z1.s[1]\n"
                 "fmla z26.s, z5.s, z1.s[2]\n"
                 "fmla z27.s, z5.s, z1.s[3]\n"
@@ -225,10 +225,11 @@
                 "fmla z17.s, z6.s, z0.s[1]\n"
                 "fmla z18.s, z6.s, z0.s[2]\n"
                 "fmla z19.s, z6.s, z0.s[3]\n"
-                "fmla z28.s, z6.s, z1.s[0]\n"
                 "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
+                "fmla z28.s, z6.s, z1.s[0]\n"
                 "fmla z29.s, z6.s, z1.s[1]\n"
                 "fmla z30.s, z6.s, z1.s[2]\n"
+                "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
                 "fmla z31.s, z6.s, z1.s[3]\n"
                 "b 4f\n"
                 "3:\n"
@@ -266,8 +267,8 @@
                 "fmla z9.s, z4.s, z2.s[1]\n"
                 "fmla z10.s, z4.s, z2.s[2]\n"
                 "fmla z11.s, z4.s, z2.s[3]\n"
-                "fmla z20.s, z4.s, z3.s[0]\n"
                 "st1w z8.s, p0, [%[c_ptr]]\n"
+                "fmla z20.s, z4.s, z3.s[0]\n"
                 "fmla z21.s, z4.s, z3.s[1]\n"
                 "fmla z22.s, z4.s, z3.s[2]\n"
                 "fmla z23.s, z4.s, z3.s[3]\n"
@@ -275,8 +276,8 @@
                 "fmla z13.s, z5.s, z2.s[1]\n"
                 "fmla z14.s, z5.s, z2.s[2]\n"
                 "fmla z15.s, z5.s, z2.s[3]\n"
-                "fmla z24.s, z5.s, z3.s[0]\n"
                 "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
+                "fmla z24.s, z5.s, z3.s[0]\n"
                 "fmla z25.s, z5.s, z3.s[1]\n"
                 "fmla z26.s, z5.s, z3.s[2]\n"
                 "fmla z27.s, z5.s, z3.s[3]\n"
@@ -284,13 +285,13 @@
                 "fmla z17.s, z6.s, z2.s[1]\n"
                 "fmla z18.s, z6.s, z2.s[2]\n"
                 "fmla z19.s, z6.s, z2.s[3]\n"
-                "fmla z28.s, z6.s, z3.s[0]\n"
                 "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
+                "fmla z28.s, z6.s, z3.s[0]\n"
                 "fmla z29.s, z6.s, z3.s[1]\n"
                 "fmla z30.s, z6.s, z3.s[2]\n"
+                "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
                 "fmla z31.s, z6.s, z3.s[3]\n"
                 "4:\n"
-                "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
                 "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
                 "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
                 "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8.hpp
similarity index 65%
rename from src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
rename to src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8.hpp
index 36f84d8..4ca43cd 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,35 +23,50 @@
  */
 #pragma once
 
-#ifdef __aarch64__
+#ifdef __ARM_FEATURE_SVE
+
+
+#include "../std_transforms_sve.hpp"
 
 namespace arm_gemm {
 
 // Actual kernel implementations
-void a64_sgemv_trans(const float *, const float *, float *, float, int, int, int);
+void sve_interleaved_fp32_mmla_3VLx8(const float *, const float *, float *, int, int, int);
 
-// Transposed SGEMV strategy class.
-class sgemv_trans {
+class interleaved_fp32_mmla_3VLx8 {
 public:
     typedef float operand_type;
     typedef float result_type;
 
-    typedef void (*kern_type)(const float *, const float *, float *, float, int, int, int);
+    typedef void (*kern_type)(const float *, const float *, float *, int, int, int);
 
     /* Kernel blocking parameters */
-    static unsigned int out_width() {
-        return 96;
+    static unsigned int out_width()
+    {
+        return get_vector_length<float>() * 3;
     }
 
-    static unsigned int k_unroll() {
-        return 1;
+    static unsigned int out_height()
+    {
+        return 8;
     }
 
-    kern_type kernel=a64_sgemv_trans;
+    static unsigned int k_unroll()
+    {
+        return 2;
+    }
 
-    sgemv_trans(const CPUInfo *ci) { UNUSED(ci); }
+    // Use the standard fixed size transforms.
+    StdTransformsSVE<operand_type, result_type, 8, 6, 2, 2> transforms = {};
+
+    kern_type kernel=sve_interleaved_fp32_mmla_3VLx8;
+
+    interleaved_fp32_mmla_3VLx8(const CPUInfo *)
+    {
+
+    }
 };
 
 } // namespace arm_gemm
 
-#endif // __aarch64__
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp
new file mode 100644
index 0000000..a404ae9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp
@@ -0,0 +1,397 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+
+#include "../../asmlib.hpp"
+
+namespace arm_gemm {
+
+void sve_interleaved_fp32_mmla_3VLx8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+    const float *a_ptr = Apanel;
+    float *c_ptr = Cpanel;
+
+    K /= 2;
+    const long loops_count = (K / 2) - 1;
+    const long tails_count = K % 2;
+
+    for (int yb=0; yb<ablocks; yb++) {
+        const float *a_ptr0 = a_ptr;
+        const float *b_ptr = Bpanel;
+
+        for (int xb=0; xb<bblocks; xb++) {
+            a_ptr = a_ptr0;
+            long loops = loops_count;
+            long tails = tails_count;
+
+            __asm __volatile (
+                "mov z8.s, #0\n"
+                "ptrue p0.s\n"
+                "mov z9.s, #0\n"
+                "mov z10.s, #0\n"
+                "mov z11.s, #0\n"
+                "ld1rqw z0.s, p0/z, [%[a_ptr]]\n"
+                "mov z12.s, #0\n"
+                "ld1w z4.s, p0/z, [%[b_ptr]]\n"
+                "mov z13.s, #0\n"
+                "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n"
+                "mov z14.s, #0\n"
+                "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n"
+                "mov z15.s, #0\n"
+                "ld1rqw z2.s, p0/z, [%[a_ptr], #0x20]\n"
+                "mov z16.s, #0\n"
+                "ld1w z6.s, p0/z, [%[b_ptr], #2, MUL VL]\n"
+                "mov z17.s, #0\n"
+                "add %[a_ptr], %[a_ptr], #0x40\n"
+                "mov z18.s, #0\n"
+                "addvl %[b_ptr], %[b_ptr], #4\n"
+                "mov z19.s, #0\n"
+                "mov z20.s, #0\n"
+                "mov z21.s, #0\n"
+                "mov z22.s, #0\n"
+                "mov z23.s, #0\n"
+                "mov z24.s, #0\n"
+                "mov z25.s, #0\n"
+                "mov z26.s, #0\n"
+                "mov z27.s, #0\n"
+                "mov z28.s, #0\n"
+                "mov z29.s, #0\n"
+                "mov z30.s, #0\n"
+                "mov z31.s, #0\n"
+                "cbz %[loops], 1f\n"
+                "2:\n"
+                ".inst 0x64a4e408 // fmmla z8.s, z0.s, z4.s\n"
+                "ld1w z7.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+                ".inst 0x64a4e42e // fmmla z14.s, z1.s, z4.s\n"
+                "ld1rqw z3.s, p0/z, [%[a_ptr], #-0x10]\n"
+                ".inst 0x64a4e454 // fmmla z20.s, z2.s, z4.s\n"
+                "subs %[loops], %[loops], #0x1\n"
+                ".inst 0x64a5e409 // fmmla z9.s, z0.s, z5.s\n"
+                ".inst 0x64a4e47a // fmmla z26.s, z3.s, z4.s\n"
+                "ld1w z4.s, p0/z, [%[b_ptr]]\n"
+                ".inst 0x64a5e42f // fmmla z15.s, z1.s, z5.s\n"
+                ".inst 0x64a5e455 // fmmla z21.s, z2.s, z5.s\n"
+                ".inst 0x64a5e47b // fmmla z27.s, z3.s, z5.s\n"
+                "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n"
+                ".inst 0x64a6e40a // fmmla z10.s, z0.s, z6.s\n"
+                ".inst 0x64a6e430 // fmmla z16.s, z1.s, z6.s\n"
+                ".inst 0x64a6e456 // fmmla z22.s, z2.s, z6.s\n"
+                ".inst 0x64a6e47c // fmmla z28.s, z3.s, z6.s\n"
+                "ld1w z6.s, p0/z, [%[b_ptr], #2, MUL VL]\n"
+                ".inst 0x64a7e40b // fmmla z11.s, z0.s, z7.s\n"
+                ".inst 0x64a7e431 // fmmla z17.s, z1.s, z7.s\n"
+                ".inst 0x64a7e457 // fmmla z23.s, z2.s, z7.s\n"
+                ".inst 0x64a7e47d // fmmla z29.s, z3.s, z7.s\n"
+                "ld1w z7.s, p0/z, [%[b_ptr], #3, MUL VL]\n"
+                ".inst 0x64a4e40c // fmmla z12.s, z0.s, z4.s\n"
+                ".inst 0x64a4e432 // fmmla z18.s, z1.s, z4.s\n"
+                ".inst 0x64a4e458 // fmmla z24.s, z2.s, z4.s\n"
+                ".inst 0x64a4e47e // fmmla z30.s, z3.s, z4.s\n"
+                "ld1w z4.s, p0/z, [%[b_ptr], #4, MUL VL]\n"
+                ".inst 0x64a5e40d // fmmla z13.s, z0.s, z5.s\n"
+                "ld1rqw z0.s, p0/z, [%[a_ptr]]\n"
+                ".inst 0x64a5e433 // fmmla z19.s, z1.s, z5.s\n"
+                "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n"
+                ".inst 0x64a5e459 // fmmla z25.s, z2.s, z5.s\n"
+                "ld1rqw z2.s, p0/z, [%[a_ptr], #0x20]\n"
+                ".inst 0x64a5e47f // fmmla z31.s, z3.s, z5.s\n"
+                "ld1w z5.s, p0/z, [%[b_ptr], #5, MUL VL]\n"
+                ".inst 0x64a6e408 // fmmla z8.s, z0.s, z6.s\n"
+                "ld1rqw z3.s, p0/z, [%[a_ptr], #0x30]\n"
+                ".inst 0x64a6e42e // fmmla z14.s, z1.s, z6.s\n"
+                "add %[a_ptr], %[a_ptr], #0x80\n"
+                ".inst 0x64a6e454 // fmmla z20.s, z2.s, z6.s\n"
+                "addvl %[b_ptr], %[b_ptr], #12\n"
+                ".inst 0x64a6e47a // fmmla z26.s, z3.s, z6.s\n"
+                ".inst 0x64a7e409 // fmmla z9.s, z0.s, z7.s\n"
+                ".inst 0x64a7e42f // fmmla z15.s, z1.s, z7.s\n"
+                "ld1w z6.s, p0/z, [%[b_ptr], #-6, MUL VL]\n"
+                ".inst 0x64a7e455 // fmmla z21.s, z2.s, z7.s\n"
+                ".inst 0x64a7e47b // fmmla z27.s, z3.s, z7.s\n"
+                "ld1w z7.s, p0/z, [%[b_ptr], #-5, MUL VL]\n"
+                ".inst 0x64a4e40a // fmmla z10.s, z0.s, z4.s\n"
+                ".inst 0x64a4e430 // fmmla z16.s, z1.s, z4.s\n"
+                ".inst 0x64a4e456 // fmmla z22.s, z2.s, z4.s\n"
+                ".inst 0x64a4e47c // fmmla z28.s, z3.s, z4.s\n"
+                "ld1w z4.s, p0/z, [%[b_ptr], #-4, MUL VL]\n"
+                ".inst 0x64a5e40b // fmmla z11.s, z0.s, z5.s\n"
+                ".inst 0x64a5e431 // fmmla z17.s, z1.s, z5.s\n"
+                ".inst 0x64a5e457 // fmmla z23.s, z2.s, z5.s\n"
+                ".inst 0x64a5e47d // fmmla z29.s, z3.s, z5.s\n"
+                "ld1w z5.s, p0/z, [%[b_ptr], #-3, MUL VL]\n"
+                ".inst 0x64a6e40c // fmmla z12.s, z0.s, z6.s\n"
+                ".inst 0x64a6e432 // fmmla z18.s, z1.s, z6.s\n"
+                ".inst 0x64a6e458 // fmmla z24.s, z2.s, z6.s\n"
+                ".inst 0x64a6e47e // fmmla z30.s, z3.s, z6.s\n"
+                "ld1w z6.s, p0/z, [%[b_ptr], #-2, MUL VL]\n"
+                ".inst 0x64a7e40d // fmmla z13.s, z0.s, z7.s\n"
+                "ld1rqw z0.s, p0/z, [%[a_ptr], #-0x40]\n"
+                ".inst 0x64a7e433 // fmmla z19.s, z1.s, z7.s\n"
+                "ld1rqw z1.s, p0/z, [%[a_ptr], #-0x30]\n"
+                ".inst 0x64a7e459 // fmmla z25.s, z2.s, z7.s\n"
+                "ld1rqw z2.s, p0/z, [%[a_ptr], #-0x20]\n"
+                ".inst 0x64a7e47f // fmmla z31.s, z3.s, z7.s\n"
+                "b.ne 2b\n"
+                "1:\n"
+                "cbz %[tails], 3f\n"
+                ".inst 0x64a4e408 // fmmla z8.s, z0.s, z4.s\n"
+                "ld1w z7.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+                ".inst 0x64a4e42e // fmmla z14.s, z1.s, z4.s\n"
+                "ld1rqw z3.s, p0/z, [%[a_ptr], #-0x10]\n"
+                ".inst 0x64a4e454 // fmmla z20.s, z2.s, z4.s\n"
+                ".inst 0x64a5e409 // fmmla z9.s, z0.s, z5.s\n"
+                ".inst 0x64a5e42f // fmmla z15.s, z1.s, z5.s\n"
+                ".inst 0x64a4e47a // fmmla z26.s, z3.s, z4.s\n"
+                "ld1w z4.s, p0/z, [%[b_ptr]]\n"
+                ".inst 0x64a5e455 // fmmla z21.s, z2.s, z5.s\n"
+                ".inst 0x64a5e47b // fmmla z27.s, z3.s, z5.s\n"
+                "ld1w z5.s, p0/z, [%[b_ptr], #1, MUL VL]\n"
+                ".inst 0x64a6e40a // fmmla z10.s, z0.s, z6.s\n"
+                ".inst 0x64a6e430 // fmmla z16.s, z1.s, z6.s\n"
+                ".inst 0x64a6e456 // fmmla z22.s, z2.s, z6.s\n"
+                ".inst 0x64a6e47c // fmmla z28.s, z3.s, z6.s\n"
+                "ld1w z6.s, p0/z, [%[b_ptr], #2, MUL VL]\n"
+                ".inst 0x64a7e40b // fmmla z11.s, z0.s, z7.s\n"
+                ".inst 0x64a7e431 // fmmla z17.s, z1.s, z7.s\n"
+                ".inst 0x64a7e457 // fmmla z23.s, z2.s, z7.s\n"
+                ".inst 0x64a7e47d // fmmla z29.s, z3.s, z7.s\n"
+                "ld1w z7.s, p0/z, [%[b_ptr], #3, MUL VL]\n"
+                ".inst 0x64a4e40c // fmmla z12.s, z0.s, z4.s\n"
+                ".inst 0x64a4e432 // fmmla z18.s, z1.s, z4.s\n"
+                ".inst 0x64a4e458 // fmmla z24.s, z2.s, z4.s\n"
+                ".inst 0x64a4e47e // fmmla z30.s, z3.s, z4.s\n"
+                "ld1w z4.s, p0/z, [%[b_ptr], #4, MUL VL]\n"
+                ".inst 0x64a5e40d // fmmla z13.s, z0.s, z5.s\n"
+                "ld1rqw z0.s, p0/z, [%[a_ptr]]\n"
+                ".inst 0x64a5e433 // fmmla z19.s, z1.s, z5.s\n"
+                "ld1rqw z1.s, p0/z, [%[a_ptr], #0x10]\n"
+                ".inst 0x64a5e459 // fmmla z25.s, z2.s, z5.s\n"
+                "ld1rqw z2.s, p0/z, [%[a_ptr], #0x20]\n"
+                ".inst 0x64a5e47f // fmmla z31.s, z3.s, z5.s\n"
+                "ld1w z5.s, p0/z, [%[b_ptr], #5, MUL VL]\n"
+                ".inst 0x64a6e408 // fmmla z8.s, z0.s, z6.s\n"
+                "ld1rqw z3.s, p0/z, [%[a_ptr], #0x30]\n"
+                ".inst 0x64a6e42e // fmmla z14.s, z1.s, z6.s\n"
+                "add %[a_ptr], %[a_ptr], #0x80\n"
+                ".inst 0x64a6e454 // fmmla z20.s, z2.s, z6.s\n"
+                "addvl %[b_ptr], %[b_ptr], #14\n"
+                ".inst 0x64a6e47a // fmmla z26.s, z3.s, z6.s\n"
+                ".inst 0x64a7e409 // fmmla z9.s, z0.s, z7.s\n"
+                ".inst 0x64a7e42f // fmmla z15.s, z1.s, z7.s\n"
+                "ld1w z6.s, p0/z, [%[b_ptr], #-8, MUL VL]\n"
+                ".inst 0x64a7e455 // fmmla z21.s, z2.s, z7.s\n"
+                ".inst 0x64a7e47b // fmmla z27.s, z3.s, z7.s\n"
+                "ld1w z7.s, p0/z, [%[b_ptr], #-7, MUL VL]\n"
+                ".inst 0x64a4e40a // fmmla z10.s, z0.s, z4.s\n"
+                ".inst 0x64a4e430 // fmmla z16.s, z1.s, z4.s\n"
+                ".inst 0x64a4e456 // fmmla z22.s, z2.s, z4.s\n"
+                ".inst 0x64a4e47c // fmmla z28.s, z3.s, z4.s\n"
+                "ld1w z4.s, p0/z, [%[b_ptr], #-6, MUL VL]\n"
+                ".inst 0x64a5e40b // fmmla z11.s, z0.s, z5.s\n"
+                ".inst 0x64a5e431 // fmmla z17.s, z1.s, z5.s\n"
+                ".inst 0x64a5e457 // fmmla z23.s, z2.s, z5.s\n"
+                ".inst 0x64a5e47d // fmmla z29.s, z3.s, z5.s\n"
+                "ld1w z5.s, p0/z, [%[b_ptr], #-5, MUL VL]\n"
+                ".inst 0x64a6e40c // fmmla z12.s, z0.s, z6.s\n"
+                ".inst 0x64a6e432 // fmmla z18.s, z1.s, z6.s\n"
+                ".inst 0x64a6e458 // fmmla z24.s, z2.s, z6.s\n"
+                ".inst 0x64a6e47e // fmmla z30.s, z3.s, z6.s\n"
+                "ld1w z6.s, p0/z, [%[b_ptr], #-4, MUL VL]\n"
+                ".inst 0x64a7e40d // fmmla z13.s, z0.s, z7.s\n"
+                "ld1rqw z0.s, p0/z, [%[a_ptr], #-0x40]\n"
+                ".inst 0x64a7e433 // fmmla z19.s, z1.s, z7.s\n"
+                "ld1rqw z1.s, p0/z, [%[a_ptr], #-0x30]\n"
+                ".inst 0x64a7e459 // fmmla z25.s, z2.s, z7.s\n"
+                "ld1rqw z2.s, p0/z, [%[a_ptr], #-0x20]\n"
+                ".inst 0x64a7e47f // fmmla z31.s, z3.s, z7.s\n"
+                "ld1w z7.s, p0/z, [%[b_ptr], #-3, MUL VL]\n"
+                ".inst 0x64a4e408 // fmmla z8.s, z0.s, z4.s\n"
+                "ld1rqw z3.s, p0/z, [%[a_ptr], #-0x10]\n"
+                ".inst 0x64a4e42e // fmmla z14.s, z1.s, z4.s\n"
+                ".inst 0x64a4e454 // fmmla z20.s, z2.s, z4.s\n"
+                ".inst 0x64a5e409 // fmmla z9.s, z0.s, z5.s\n"
+                ".inst 0x64a4e47a // fmmla z26.s, z3.s, z4.s\n"
+                "ld1w z4.s, p0/z, [%[b_ptr], #-2, MUL VL]\n"
+                ".inst 0x64a5e42f // fmmla z15.s, z1.s, z5.s\n"
+                ".inst 0x64a5e455 // fmmla z21.s, z2.s, z5.s\n"
+                ".inst 0x64a5e47b // fmmla z27.s, z3.s, z5.s\n"
+                "ld1w z5.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+                ".inst 0x64a6e40a // fmmla z10.s, z0.s, z6.s\n"
+                ".inst 0x64a6e430 // fmmla z16.s, z1.s, z6.s\n"
+                ".inst 0x64a6e456 // fmmla z22.s, z2.s, z6.s\n"
+                ".inst 0x64a6e47c // fmmla z28.s, z3.s, z6.s\n"
+                "uzp1 z6.d, z14.d, z15.d\n"
+                ".inst 0x64a7e40b // fmmla z11.s, z0.s, z7.s\n"
+                ".inst 0x64a7e431 // fmmla z17.s, z1.s, z7.s\n"
+                ".inst 0x64a7e457 // fmmla z23.s, z2.s, z7.s\n"
+                ".inst 0x64a7e47d // fmmla z29.s, z3.s, z7.s\n"
+                ".inst 0x64a4e40c // fmmla z12.s, z0.s, z4.s\n"
+                "uzp1 z7.d, z16.d, z17.d\n"
+                ".inst 0x64a4e432 // fmmla z18.s, z1.s, z4.s\n"
+                ".inst 0x64a4e458 // fmmla z24.s, z2.s, z4.s\n"
+                ".inst 0x64a4e47e // fmmla z30.s, z3.s, z4.s\n"
+                "uzp2 z4.d, z10.d, z11.d\n"
+                ".inst 0x64a5e40d // fmmla z13.s, z0.s, z5.s\n"
+                "uzp1 z0.d, z8.d, z9.d\n"
+                ".inst 0x64a5e433 // fmmla z19.s, z1.s, z5.s\n"
+                "uzp1 z1.d, z10.d, z11.d\n"
+                ".inst 0x64a5e459 // fmmla z25.s, z2.s, z5.s\n"
+                "st1w z0.s, p0, [%[c_ptr]]\n"
+                "uzp1 z2.d, z12.d, z13.d\n"
+                "uzp1 z0.d, z18.d, z19.d\n"
+                ".inst 0x64a5e47f // fmmla z31.s, z3.s, z5.s\n"
+                "st1w z1.s, p0, [%[c_ptr], #1, MUL VL]\n"
+                "uzp2 z3.d, z8.d, z9.d\n"
+                "uzp2 z5.d, z12.d, z13.d\n"
+                "uzp2 z1.d, z14.d, z15.d\n"
+                "st1w z2.s, p0, [%[c_ptr], #2, MUL VL]\n"
+                "b 4f\n"
+                "3:\n"
+                ".inst 0x64a4e408 // fmmla z8.s, z0.s, z4.s\n"
+                "ld1w z7.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+                ".inst 0x64a4e42e // fmmla z14.s, z1.s, z4.s\n"
+                "ld1rqw z3.s, p0/z, [%[a_ptr], #-0x10]\n"
+                ".inst 0x64a4e454 // fmmla z20.s, z2.s, z4.s\n"
+                "add %[a_ptr], %[a_ptr], #0x40\n"
+                ".inst 0x64a5e409 // fmmla z9.s, z0.s, z5.s\n"
+                "addvl %[b_ptr], %[b_ptr], #8\n"
+                ".inst 0x64a4e47a // fmmla z26.s, z3.s, z4.s\n"
+                ".inst 0x64a5e42f // fmmla z15.s, z1.s, z5.s\n"
+                ".inst 0x64a5e455 // fmmla z21.s, z2.s, z5.s\n"
+                "ld1w z4.s, p0/z, [%[b_ptr], #-8, MUL VL]\n"
+                ".inst 0x64a5e47b // fmmla z27.s, z3.s, z5.s\n"
+                "ld1w z5.s, p0/z, [%[b_ptr], #-7, MUL VL]\n"
+                ".inst 0x64a6e40a // fmmla z10.s, z0.s, z6.s\n"
+                ".inst 0x64a6e430 // fmmla z16.s, z1.s, z6.s\n"
+                ".inst 0x64a6e456 // fmmla z22.s, z2.s, z6.s\n"
+                ".inst 0x64a6e47c // fmmla z28.s, z3.s, z6.s\n"
+                "ld1w z6.s, p0/z, [%[b_ptr], #-6, MUL VL]\n"
+                ".inst 0x64a7e40b // fmmla z11.s, z0.s, z7.s\n"
+                ".inst 0x64a7e431 // fmmla z17.s, z1.s, z7.s\n"
+                ".inst 0x64a7e457 // fmmla z23.s, z2.s, z7.s\n"
+                ".inst 0x64a7e47d // fmmla z29.s, z3.s, z7.s\n"
+                "ld1w z7.s, p0/z, [%[b_ptr], #-5, MUL VL]\n"
+                ".inst 0x64a4e40c // fmmla z12.s, z0.s, z4.s\n"
+                ".inst 0x64a4e432 // fmmla z18.s, z1.s, z4.s\n"
+                ".inst 0x64a4e458 // fmmla z24.s, z2.s, z4.s\n"
+                ".inst 0x64a4e47e // fmmla z30.s, z3.s, z4.s\n"
+                "ld1w z4.s, p0/z, [%[b_ptr], #-4, MUL VL]\n"
+                ".inst 0x64a5e40d // fmmla z13.s, z0.s, z5.s\n"
+                "ld1rqw z0.s, p0/z, [%[a_ptr], #-0x40]\n"
+                ".inst 0x64a5e433 // fmmla z19.s, z1.s, z5.s\n"
+                "ld1rqw z1.s, p0/z, [%[a_ptr], #-0x30]\n"
+                ".inst 0x64a5e459 // fmmla z25.s, z2.s, z5.s\n"
+                "ld1rqw z2.s, p0/z, [%[a_ptr], #-0x20]\n"
+                ".inst 0x64a5e47f // fmmla z31.s, z3.s, z5.s\n"
+                "ld1w z5.s, p0/z, [%[b_ptr], #-3, MUL VL]\n"
+                ".inst 0x64a6e408 // fmmla z8.s, z0.s, z6.s\n"
+                "ld1rqw z3.s, p0/z, [%[a_ptr], #-0x10]\n"
+                ".inst 0x64a6e42e // fmmla z14.s, z1.s, z6.s\n"
+                ".inst 0x64a6e454 // fmmla z20.s, z2.s, z6.s\n"
+                ".inst 0x64a7e409 // fmmla z9.s, z0.s, z7.s\n"
+                ".inst 0x64a6e47a // fmmla z26.s, z3.s, z6.s\n"
+                "ld1w z6.s, p0/z, [%[b_ptr], #-2, MUL VL]\n"
+                ".inst 0x64a7e42f // fmmla z15.s, z1.s, z7.s\n"
+                ".inst 0x64a7e455 // fmmla z21.s, z2.s, z7.s\n"
+                ".inst 0x64a7e47b // fmmla z27.s, z3.s, z7.s\n"
+                "ld1w z7.s, p0/z, [%[b_ptr], #-1, MUL VL]\n"
+                ".inst 0x64a4e40a // fmmla z10.s, z0.s, z4.s\n"
+                ".inst 0x64a4e430 // fmmla z16.s, z1.s, z4.s\n"
+                ".inst 0x64a4e456 // fmmla z22.s, z2.s, z4.s\n"
+                ".inst 0x64a4e47c // fmmla z28.s, z3.s, z4.s\n"
+                ".inst 0x64a5e40b // fmmla z11.s, z0.s, z5.s\n"
+                ".inst 0x64a5e431 // fmmla z17.s, z1.s, z5.s\n"
+                ".inst 0x64a5e457 // fmmla z23.s, z2.s, z5.s\n"
+                ".inst 0x64a5e47d // fmmla z29.s, z3.s, z5.s\n"
+                "uzp2 z4.d, z10.d, z11.d\n"
+                ".inst 0x64a6e40c // fmmla z12.s, z0.s, z6.s\n"
+                ".inst 0x64a6e432 // fmmla z18.s, z1.s, z6.s\n"
+                ".inst 0x64a6e458 // fmmla z24.s, z2.s, z6.s\n"
+                ".inst 0x64a6e47e // fmmla z30.s, z3.s, z6.s\n"
+                "uzp1 z6.d, z14.d, z15.d\n"
+                ".inst 0x64a7e40d // fmmla z13.s, z0.s, z7.s\n"
+                "uzp1 z0.d, z8.d, z9.d\n"
+                ".inst 0x64a7e433 // fmmla z19.s, z1.s, z7.s\n"
+                "uzp1 z1.d, z10.d, z11.d\n"
+                "uzp2 z5.d, z12.d, z13.d\n"
+                "st1w z0.s, p0, [%[c_ptr]]\n"
+                ".inst 0x64a7e459 // fmmla z25.s, z2.s, z7.s\n"
+                "uzp1 z2.d, z12.d, z13.d\n"
+                "uzp1 z0.d, z18.d, z19.d\n"
+                "st1w z1.s, p0, [%[c_ptr], #1, MUL VL]\n"
+                "uzp2 z1.d, z14.d, z15.d\n"
+                ".inst 0x64a7e47f // fmmla z31.s, z3.s, z7.s\n"
+                "uzp2 z3.d, z8.d, z9.d\n"
+                "st1w z2.s, p0, [%[c_ptr], #2, MUL VL]\n"
+                "uzp1 z7.d, z16.d, z17.d\n"
+                "4:\n"
+                "uzp2 z2.d, z16.d, z17.d\n"
+                "st1w z3.s, p0, [%[c_ptr], #3, MUL VL]\n"
+                "uzp2 z3.d, z18.d, z19.d\n"
+                "st1w z4.s, p0, [%[c_ptr], #4, MUL VL]\n"
+                "uzp1 z4.d, z20.d, z21.d\n"
+                "st1w z5.s, p0, [%[c_ptr], #5, MUL VL]\n"
+                "uzp1 z5.d, z22.d, z23.d\n"
+                "st1w z6.s, p0, [%[c_ptr], #6, MUL VL]\n"
+                "uzp1 z6.d, z24.d, z25.d\n"
+                "st1w z7.s, p0, [%[c_ptr], #7, MUL VL]\n"
+                "addvl %[c_ptr], %[c_ptr], #16\n"
+                "uzp2 z7.d, z20.d, z21.d\n"
+                "st1w z0.s, p0, [%[c_ptr], #-8, MUL VL]\n"
+                "uzp2 z0.d, z22.d, z23.d\n"
+                "st1w z1.s, p0, [%[c_ptr], #-7, MUL VL]\n"
+                "uzp2 z1.d, z24.d, z25.d\n"
+                "st1w z2.s, p0, [%[c_ptr], #-6, MUL VL]\n"
+                "uzp1 z2.d, z26.d, z27.d\n"
+                "st1w z3.s, p0, [%[c_ptr], #-5, MUL VL]\n"
+                "uzp1 z3.d, z28.d, z29.d\n"
+                "st1w z4.s, p0, [%[c_ptr], #-4, MUL VL]\n"
+                "uzp1 z4.d, z30.d, z31.d\n"
+                "st1w z5.s, p0, [%[c_ptr], #-3, MUL VL]\n"
+                "uzp2 z5.d, z26.d, z27.d\n"
+                "st1w z6.s, p0, [%[c_ptr], #-2, MUL VL]\n"
+                "uzp2 z6.d, z28.d, z29.d\n"
+                "st1w z7.s, p0, [%[c_ptr], #-1, MUL VL]\n"
+                "uzp2 z7.d, z30.d, z31.d\n"
+                "st1w z0.s, p0, [%[c_ptr]]\n"
+                "st1w z1.s, p0, [%[c_ptr], #1, MUL VL]\n"
+                "st1w z2.s, p0, [%[c_ptr], #2, MUL VL]\n"
+                "st1w z3.s, p0, [%[c_ptr], #3, MUL VL]\n"
+                "st1w z4.s, p0, [%[c_ptr], #4, MUL VL]\n"
+                "st1w z5.s, p0, [%[c_ptr], #5, MUL VL]\n"
+                "st1w z6.s, p0, [%[c_ptr], #6, MUL VL]\n"
+                "st1w z7.s, p0, [%[c_ptr], #7, MUL VL]\n"
+                "addvl %[c_ptr], %[c_ptr], #8\n"
+            : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
+              [loops] "+r" (loops), [tails] "+r" (tails)
+            :
+            : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
+            );
+        }
+    }
+}
+
+} // namespace arm_gemm
+
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp
index effdbc6..e40ba21 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,7 +61,10 @@
 
     kern_type kernel=sve_interleaved_s8s32_dot_3VLx8;
 
-    interleaved_s8s32_dot_3VLx8(const CPUInfo *ci) { UNUSED(ci); }
+    interleaved_s8s32_dot_3VLx8(const CPUInfo *)
+    {
+
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp
index 7640fca..cdc7070 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -51,20 +51,20 @@
                 "mov z9.s, #0\n"
                 "mov z10.s, #0\n"
                 "mov z11.s, #0\n"
-                "mov z12.s, #0\n"
                 "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
-                "mov z13.s, #0\n"
+                "mov z12.s, #0\n"
                 "ld1b z4.b, p0/z, [%[b_ptr]]\n"
-                "mov z14.s, #0\n"
+                "mov z13.s, #0\n"
                 "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
-                "mov z15.s, #0\n"
+                "mov z14.s, #0\n"
                 "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
-                "mov z16.s, #0\n"
+                "mov z15.s, #0\n"
                 "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
-                "mov z17.s, #0\n"
+                "mov z16.s, #0\n"
                 "add %[a_ptr], %[a_ptr], #0x40\n"
-                "mov z18.s, #0\n"
+                "mov z17.s, #0\n"
                 "addvl %[b_ptr], %[b_ptr], #3\n"
+                "mov z18.s, #0\n"
                 "mov z19.s, #0\n"
                 "mov z20.s, #0\n"
                 "mov z21.s, #0\n"
@@ -208,8 +208,8 @@
                 "sdot z9.s, z4.b, z0.b[1]\n"
                 "sdot z10.s, z4.b, z0.b[2]\n"
                 "sdot z11.s, z4.b, z0.b[3]\n"
-                "sdot z20.s, z4.b, z1.b[0]\n"
                 "st1w z8.s, p0, [%[c_ptr]]\n"
+                "sdot z20.s, z4.b, z1.b[0]\n"
                 "sdot z21.s, z4.b, z1.b[1]\n"
                 "sdot z22.s, z4.b, z1.b[2]\n"
                 "sdot z23.s, z4.b, z1.b[3]\n"
@@ -217,8 +217,8 @@
                 "sdot z13.s, z5.b, z0.b[1]\n"
                 "sdot z14.s, z5.b, z0.b[2]\n"
                 "sdot z15.s, z5.b, z0.b[3]\n"
-                "sdot z24.s, z5.b, z1.b[0]\n"
                 "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
+                "sdot z24.s, z5.b, z1.b[0]\n"
                 "sdot z25.s, z5.b, z1.b[1]\n"
                 "sdot z26.s, z5.b, z1.b[2]\n"
                 "sdot z27.s, z5.b, z1.b[3]\n"
@@ -226,10 +226,11 @@
                 "sdot z17.s, z6.b, z0.b[1]\n"
                 "sdot z18.s, z6.b, z0.b[2]\n"
                 "sdot z19.s, z6.b, z0.b[3]\n"
-                "sdot z28.s, z6.b, z1.b[0]\n"
                 "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
+                "sdot z28.s, z6.b, z1.b[0]\n"
                 "sdot z29.s, z6.b, z1.b[1]\n"
                 "sdot z30.s, z6.b, z1.b[2]\n"
+                "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
                 "sdot z31.s, z6.b, z1.b[3]\n"
                 "b 4f\n"
                 "3:\n"
@@ -267,8 +268,8 @@
                 "sdot z9.s, z4.b, z2.b[1]\n"
                 "sdot z10.s, z4.b, z2.b[2]\n"
                 "sdot z11.s, z4.b, z2.b[3]\n"
-                "sdot z20.s, z4.b, z3.b[0]\n"
                 "st1w z8.s, p0, [%[c_ptr]]\n"
+                "sdot z20.s, z4.b, z3.b[0]\n"
                 "sdot z21.s, z4.b, z3.b[1]\n"
                 "sdot z22.s, z4.b, z3.b[2]\n"
                 "sdot z23.s, z4.b, z3.b[3]\n"
@@ -276,8 +277,8 @@
                 "sdot z13.s, z5.b, z2.b[1]\n"
                 "sdot z14.s, z5.b, z2.b[2]\n"
                 "sdot z15.s, z5.b, z2.b[3]\n"
-                "sdot z24.s, z5.b, z3.b[0]\n"
                 "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
+                "sdot z24.s, z5.b, z3.b[0]\n"
                 "sdot z25.s, z5.b, z3.b[1]\n"
                 "sdot z26.s, z5.b, z3.b[2]\n"
                 "sdot z27.s, z5.b, z3.b[3]\n"
@@ -285,13 +286,13 @@
                 "sdot z17.s, z6.b, z2.b[1]\n"
                 "sdot z18.s, z6.b, z2.b[2]\n"
                 "sdot z19.s, z6.b, z2.b[3]\n"
-                "sdot z28.s, z6.b, z3.b[0]\n"
                 "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
+                "sdot z28.s, z6.b, z3.b[0]\n"
                 "sdot z29.s, z6.b, z3.b[1]\n"
                 "sdot z30.s, z6.b, z3.b[2]\n"
+                "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
                 "sdot z31.s, z6.b, z3.b[3]\n"
                 "4:\n"
-                "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
                 "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
                 "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
                 "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp
index cd50d0d..361598d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,9 +61,9 @@
 
     kern_type kernel=sve_interleaved_s8s32_mmla_3VLx8;
 
-    interleaved_s8s32_mmla_3VLx8(const CPUInfo *ci)
+    interleaved_s8s32_mmla_3VLx8(const CPUInfo *)
     {
-        UNUSED(ci);
+
     }
 };
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp
index d636c9d..cde9ec3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,13 +63,11 @@
                 "mov z16.s, #0\n"
                 "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
                 "mov z17.s, #0\n"
-                "ld1rqb z3.b, p0/z, [%[a_ptr], #0x30]\n"
-                "mov z18.s, #0\n"
-                "ld1b z7.b, p0/z, [%[b_ptr], #3, MUL VL]\n"
-                "mov z19.s, #0\n"
                 "add %[a_ptr], %[a_ptr], #0x40\n"
-                "mov z20.s, #0\n"
+                "mov z18.s, #0\n"
                 "addvl %[b_ptr], %[b_ptr], #4\n"
+                "mov z19.s, #0\n"
+                "mov z20.s, #0\n"
                 "mov z21.s, #0\n"
                 "mov z22.s, #0\n"
                 "mov z23.s, #0\n"
@@ -84,12 +82,14 @@
                 "cbz %[loops], 1f\n"
                 "2:\n"
                 ".inst 0x45049808 // smmla z8.s, z0.b, z4.b\n"
-                "subs %[loops], %[loops], #0x1\n"
+                "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
                 ".inst 0x4504982e // smmla z14.s, z1.b, z4.b\n"
+                "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
                 ".inst 0x45049854 // smmla z20.s, z2.b, z4.b\n"
+                "subs %[loops], %[loops], #0x1\n"
+                ".inst 0x45059809 // smmla z9.s, z0.b, z5.b\n"
                 ".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n"
                 "ld1b z4.b, p0/z, [%[b_ptr]]\n"
-                ".inst 0x45059809 // smmla z9.s, z0.b, z5.b\n"
                 ".inst 0x4505982f // smmla z15.s, z1.b, z5.b\n"
                 ".inst 0x45059855 // smmla z21.s, z2.b, z5.b\n"
                 ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n"
@@ -152,18 +152,18 @@
                 ".inst 0x45079859 // smmla z25.s, z2.b, z7.b\n"
                 "ld1rqb z2.b, p0/z, [%[a_ptr], #-0x20]\n"
                 ".inst 0x4507987f // smmla z31.s, z3.b, z7.b\n"
-                "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
-                "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
                 "b.ne 2b\n"
                 "1:\n"
                 "cbz %[tails], 3f\n"
                 ".inst 0x45049808 // smmla z8.s, z0.b, z4.b\n"
+                "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
                 ".inst 0x4504982e // smmla z14.s, z1.b, z4.b\n"
+                "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
                 ".inst 0x45049854 // smmla z20.s, z2.b, z4.b\n"
-                ".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n"
-                "ld1b z4.b, p0/z, [%[b_ptr]]\n"
                 ".inst 0x45059809 // smmla z9.s, z0.b, z5.b\n"
                 ".inst 0x4505982f // smmla z15.s, z1.b, z5.b\n"
+                ".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n"
+                "ld1b z4.b, p0/z, [%[b_ptr]]\n"
                 ".inst 0x45059855 // smmla z21.s, z2.b, z5.b\n"
                 ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n"
                 "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
@@ -269,15 +269,17 @@
                 "b 4f\n"
                 "3:\n"
                 ".inst 0x45049808 // smmla z8.s, z0.b, z4.b\n"
-                "add %[a_ptr], %[a_ptr], #0x40\n"
+                "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
                 ".inst 0x4504982e // smmla z14.s, z1.b, z4.b\n"
-                "addvl %[b_ptr], %[b_ptr], #8\n"
+                "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
                 ".inst 0x45049854 // smmla z20.s, z2.b, z4.b\n"
-                ".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n"
+                "add %[a_ptr], %[a_ptr], #0x40\n"
                 ".inst 0x45059809 // smmla z9.s, z0.b, z5.b\n"
-                "ld1b z4.b, p0/z, [%[b_ptr], #-8, MUL VL]\n"
+                "addvl %[b_ptr], %[b_ptr], #8\n"
+                ".inst 0x4504987a // smmla z26.s, z3.b, z4.b\n"
                 ".inst 0x4505982f // smmla z15.s, z1.b, z5.b\n"
                 ".inst 0x45059855 // smmla z21.s, z2.b, z5.b\n"
+                "ld1b z4.b, p0/z, [%[b_ptr], #-8, MUL VL]\n"
                 ".inst 0x4505987b // smmla z27.s, z3.b, z5.b\n"
                 "ld1b z5.b, p0/z, [%[b_ptr], #-7, MUL VL]\n"
                 ".inst 0x4506980a // smmla z10.s, z0.b, z6.b\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp
index d3c8851..252f38e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,7 +61,10 @@
 
     kern_type kernel=sve_interleaved_u8u32_dot_3VLx8;
 
-    interleaved_u8u32_dot_3VLx8(const CPUInfo *ci) { UNUSED(ci); }
+    interleaved_u8u32_dot_3VLx8(const CPUInfo *)
+    {
+
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp
index f4d33a9..6626f84 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -51,20 +51,20 @@
                 "mov z9.s, #0\n"
                 "mov z10.s, #0\n"
                 "mov z11.s, #0\n"
-                "mov z12.s, #0\n"
                 "ld1rqb z0.b, p0/z, [%[a_ptr]]\n"
-                "mov z13.s, #0\n"
+                "mov z12.s, #0\n"
                 "ld1b z4.b, p0/z, [%[b_ptr]]\n"
-                "mov z14.s, #0\n"
+                "mov z13.s, #0\n"
                 "ld1rqb z1.b, p0/z, [%[a_ptr], #0x10]\n"
-                "mov z15.s, #0\n"
+                "mov z14.s, #0\n"
                 "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
-                "mov z16.s, #0\n"
+                "mov z15.s, #0\n"
                 "ld1rqb z2.b, p0/z, [%[a_ptr], #0x20]\n"
-                "mov z17.s, #0\n"
+                "mov z16.s, #0\n"
                 "add %[a_ptr], %[a_ptr], #0x40\n"
-                "mov z18.s, #0\n"
+                "mov z17.s, #0\n"
                 "addvl %[b_ptr], %[b_ptr], #3\n"
+                "mov z18.s, #0\n"
                 "mov z19.s, #0\n"
                 "mov z20.s, #0\n"
                 "mov z21.s, #0\n"
@@ -208,8 +208,8 @@
                 "udot z9.s, z4.b, z0.b[1]\n"
                 "udot z10.s, z4.b, z0.b[2]\n"
                 "udot z11.s, z4.b, z0.b[3]\n"
-                "udot z20.s, z4.b, z1.b[0]\n"
                 "st1w z8.s, p0, [%[c_ptr]]\n"
+                "udot z20.s, z4.b, z1.b[0]\n"
                 "udot z21.s, z4.b, z1.b[1]\n"
                 "udot z22.s, z4.b, z1.b[2]\n"
                 "udot z23.s, z4.b, z1.b[3]\n"
@@ -217,8 +217,8 @@
                 "udot z13.s, z5.b, z0.b[1]\n"
                 "udot z14.s, z5.b, z0.b[2]\n"
                 "udot z15.s, z5.b, z0.b[3]\n"
-                "udot z24.s, z5.b, z1.b[0]\n"
                 "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
+                "udot z24.s, z5.b, z1.b[0]\n"
                 "udot z25.s, z5.b, z1.b[1]\n"
                 "udot z26.s, z5.b, z1.b[2]\n"
                 "udot z27.s, z5.b, z1.b[3]\n"
@@ -226,10 +226,11 @@
                 "udot z17.s, z6.b, z0.b[1]\n"
                 "udot z18.s, z6.b, z0.b[2]\n"
                 "udot z19.s, z6.b, z0.b[3]\n"
-                "udot z28.s, z6.b, z1.b[0]\n"
                 "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
+                "udot z28.s, z6.b, z1.b[0]\n"
                 "udot z29.s, z6.b, z1.b[1]\n"
                 "udot z30.s, z6.b, z1.b[2]\n"
+                "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
                 "udot z31.s, z6.b, z1.b[3]\n"
                 "b 4f\n"
                 "3:\n"
@@ -267,8 +268,8 @@
                 "udot z9.s, z4.b, z2.b[1]\n"
                 "udot z10.s, z4.b, z2.b[2]\n"
                 "udot z11.s, z4.b, z2.b[3]\n"
-                "udot z20.s, z4.b, z3.b[0]\n"
                 "st1w z8.s, p0, [%[c_ptr]]\n"
+                "udot z20.s, z4.b, z3.b[0]\n"
                 "udot z21.s, z4.b, z3.b[1]\n"
                 "udot z22.s, z4.b, z3.b[2]\n"
                 "udot z23.s, z4.b, z3.b[3]\n"
@@ -276,8 +277,8 @@
                 "udot z13.s, z5.b, z2.b[1]\n"
                 "udot z14.s, z5.b, z2.b[2]\n"
                 "udot z15.s, z5.b, z2.b[3]\n"
-                "udot z24.s, z5.b, z3.b[0]\n"
                 "st1w z12.s, p0, [%[c_ptr], #1, MUL VL]\n"
+                "udot z24.s, z5.b, z3.b[0]\n"
                 "udot z25.s, z5.b, z3.b[1]\n"
                 "udot z26.s, z5.b, z3.b[2]\n"
                 "udot z27.s, z5.b, z3.b[3]\n"
@@ -285,13 +286,13 @@
                 "udot z17.s, z6.b, z2.b[1]\n"
                 "udot z18.s, z6.b, z2.b[2]\n"
                 "udot z19.s, z6.b, z2.b[3]\n"
-                "udot z28.s, z6.b, z3.b[0]\n"
                 "st1w z16.s, p0, [%[c_ptr], #2, MUL VL]\n"
+                "udot z28.s, z6.b, z3.b[0]\n"
                 "udot z29.s, z6.b, z3.b[1]\n"
                 "udot z30.s, z6.b, z3.b[2]\n"
+                "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
                 "udot z31.s, z6.b, z3.b[3]\n"
                 "4:\n"
-                "st1w z9.s, p0, [%[c_ptr], #3, MUL VL]\n"
                 "st1w z13.s, p0, [%[c_ptr], #4, MUL VL]\n"
                 "st1w z17.s, p0, [%[c_ptr], #5, MUL VL]\n"
                 "st1w z10.s, p0, [%[c_ptr], #6, MUL VL]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp
index 9b5ca10..ed44a9d 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,9 +61,9 @@
 
     kern_type kernel=sve_interleaved_u8u32_mmla_3VLx8;
 
-    interleaved_u8u32_mmla_3VLx8(const CPUInfo *ci)
+    interleaved_u8u32_mmla_3VLx8(const CPUInfo *)
     {
-        UNUSED(ci);
+
     }
 };
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp
index 15cc8fb..81a1dbc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,13 +63,11 @@
                 "mov z16.s, #0\n"
                 "ld1b z6.b, p0/z, [%[b_ptr], #2, MUL VL]\n"
                 "mov z17.s, #0\n"
-                "ld1rqb z3.b, p0/z, [%[a_ptr], #0x30]\n"
-                "mov z18.s, #0\n"
-                "ld1b z7.b, p0/z, [%[b_ptr], #3, MUL VL]\n"
-                "mov z19.s, #0\n"
                 "add %[a_ptr], %[a_ptr], #0x40\n"
-                "mov z20.s, #0\n"
+                "mov z18.s, #0\n"
                 "addvl %[b_ptr], %[b_ptr], #4\n"
+                "mov z19.s, #0\n"
+                "mov z20.s, #0\n"
                 "mov z21.s, #0\n"
                 "mov z22.s, #0\n"
                 "mov z23.s, #0\n"
@@ -84,12 +82,14 @@
                 "cbz %[loops], 1f\n"
                 "2:\n"
                 ".inst 0x45c49808 // ummla z8.s, z0.b, z4.b\n"
-                "subs %[loops], %[loops], #0x1\n"
+                "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
                 ".inst 0x45c4982e // ummla z14.s, z1.b, z4.b\n"
+                "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
                 ".inst 0x45c49854 // ummla z20.s, z2.b, z4.b\n"
+                "subs %[loops], %[loops], #0x1\n"
+                ".inst 0x45c59809 // ummla z9.s, z0.b, z5.b\n"
                 ".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n"
                 "ld1b z4.b, p0/z, [%[b_ptr]]\n"
-                ".inst 0x45c59809 // ummla z9.s, z0.b, z5.b\n"
                 ".inst 0x45c5982f // ummla z15.s, z1.b, z5.b\n"
                 ".inst 0x45c59855 // ummla z21.s, z2.b, z5.b\n"
                 ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n"
@@ -152,18 +152,18 @@
                 ".inst 0x45c79859 // ummla z25.s, z2.b, z7.b\n"
                 "ld1rqb z2.b, p0/z, [%[a_ptr], #-0x20]\n"
                 ".inst 0x45c7987f // ummla z31.s, z3.b, z7.b\n"
-                "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
-                "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
                 "b.ne 2b\n"
                 "1:\n"
                 "cbz %[tails], 3f\n"
                 ".inst 0x45c49808 // ummla z8.s, z0.b, z4.b\n"
+                "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
                 ".inst 0x45c4982e // ummla z14.s, z1.b, z4.b\n"
+                "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
                 ".inst 0x45c49854 // ummla z20.s, z2.b, z4.b\n"
-                ".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n"
-                "ld1b z4.b, p0/z, [%[b_ptr]]\n"
                 ".inst 0x45c59809 // ummla z9.s, z0.b, z5.b\n"
                 ".inst 0x45c5982f // ummla z15.s, z1.b, z5.b\n"
+                ".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n"
+                "ld1b z4.b, p0/z, [%[b_ptr]]\n"
                 ".inst 0x45c59855 // ummla z21.s, z2.b, z5.b\n"
                 ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n"
                 "ld1b z5.b, p0/z, [%[b_ptr], #1, MUL VL]\n"
@@ -269,15 +269,17 @@
                 "b 4f\n"
                 "3:\n"
                 ".inst 0x45c49808 // ummla z8.s, z0.b, z4.b\n"
-                "add %[a_ptr], %[a_ptr], #0x40\n"
+                "ld1b z7.b, p0/z, [%[b_ptr], #-1, MUL VL]\n"
                 ".inst 0x45c4982e // ummla z14.s, z1.b, z4.b\n"
-                "addvl %[b_ptr], %[b_ptr], #8\n"
+                "ld1rqb z3.b, p0/z, [%[a_ptr], #-0x10]\n"
                 ".inst 0x45c49854 // ummla z20.s, z2.b, z4.b\n"
-                ".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n"
+                "add %[a_ptr], %[a_ptr], #0x40\n"
                 ".inst 0x45c59809 // ummla z9.s, z0.b, z5.b\n"
-                "ld1b z4.b, p0/z, [%[b_ptr], #-8, MUL VL]\n"
+                "addvl %[b_ptr], %[b_ptr], #8\n"
+                ".inst 0x45c4987a // ummla z26.s, z3.b, z4.b\n"
                 ".inst 0x45c5982f // ummla z15.s, z1.b, z5.b\n"
                 ".inst 0x45c59855 // ummla z21.s, z2.b, z5.b\n"
+                "ld1b z4.b, p0/z, [%[b_ptr], #-8, MUL VL]\n"
                 ".inst 0x45c5987b // ummla z27.s, z3.b, z5.b\n"
                 "ld1b z5.b, p0/z, [%[b_ptr], #-7, MUL VL]\n"
                 ".inst 0x45c6980a // ummla z10.s, z0.b, z6.b\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_bf16fp32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_bf16fp32_dot_4VLx4.hpp
deleted file mode 100644
index 59103d2..0000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_bf16fp32_dot_4VLx4.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
-#include "../bfloat.hpp"
-
-
-namespace arm_gemm
-{
-
-// Actual kernel implementations
-void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *, int, const bfloat16 *, int ldb, float *, int, int, int, int, const float *, Activation, bool);
-
-class native_bf16fp32_dot_4VLx4
-{
-public:
-    typedef bfloat16 operand_type;
-    typedef float result_type;
-
-    typedef void (*kern_type)(const bfloat16 *, int, const bfloat16 *, int ldb, float *, int, int, int, int, const float *, Activation, bool);
-
-    /* Kernel blocking parameters */
-    static constexpr unsigned int out_height()
-    {
-        return 4;
-    }
-
-    static unsigned int out_width()
-    {
-        return get_vector_length<float>() * 4;
-    }
-
-    static constexpr unsigned int k_unroll()
-    {
-        return 2;
-    }
-
-    static constexpr bool supports_append()
-    {
-        return false;
-    }
-
-    static constexpr bool supports_bias()
-    {
-        return true;
-    }
-
-    static constexpr bool supports_activation()
-    {
-        return true;
-    }
-
-
-
-    // Default to the generic kernel
-    kern_type kernel=sve_native_bf16fp32_dot_4VLx4;
-
-    native_bf16fp32_dot_4VLx4(const CPUInfo *ci) { UNUSED(ci); }
-};
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_bf16fp32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_bf16fp32_dot_4VLx4/generic.cpp
deleted file mode 100644
index ce1971b..0000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_bf16fp32_dot_4VLx4/generic.cpp
+++ /dev/null
@@ -1,3275 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include "../../bfloat.hpp"
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_native_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B, int ldb, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool append) {
-    const long loops_count = ((K + 8) / 16) - 1;
-    K -= loops_count * 16;
-    const long regs_count = (K / 8) - 1;
-    K -= (regs_count + 1) * 8;
-    const long leftovers = K;
-    const long blocks_count = K / 2;
-    const long odds_count = K - (blocks_count * 2);
-    float nullbias[256];
-    if (!append && !bias) {
-        memset(nullbias, 0, (4 * get_vector_length<float>() * sizeof(float)));
-    }
-    float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
-    float maxval =   static_cast<float>(std::numeric_limits<float>::infinity());
-    const float * const minptr = &minval;
-    const float * const maxptr = &maxval;
-
-    switch(act.type)
-    {
-        default:
-        case Activation::Type::None:
-            break;
-        case Activation::Type::BoundedReLU:
-            maxval = static_cast<float>(act.param1);
-            /* fall through */
-        case Activation::Type::ReLU:
-            minval = 0.0f;
-            break;
-    }
-
-    for (int y=0; y<M; y+=4) {
-        const bfloat16 * const a_ptr0_base = A + (y * lda);
-        const unsigned long ldab = lda * sizeof(bfloat16);
-
-        float *c_ptr0 = C + (y * ldc);
-
-        for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) {
-            const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>()));
-            long loops = loops_count;
-            long regs = regs_count;
-            long temp = 0;
-            long blocks = blocks_count;
-            long odds = odds_count;
-            const bfloat16 *a_ptr0 = a_ptr0_base;
-            const bfloat16 *b_ptr0 = B + x0;
-            const bfloat16 *b_ptr1 = b_ptr0 + ldb;
-            long ldbb = ldb * sizeof(bfloat16) * 2;
-            const unsigned long ldcb = ldc * sizeof(float);
-            const float *biasptr = bias ? bias+x0 : nullbias;
-
-            switch(M-y) {
-                case 1:
-                    __asm __volatile (
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "whilelt p4.h, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "ld1w z16.s, p0/z, [%[biasptr]]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "whilelt p5.h, %[temp], %[width]\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "cbz %[regs], 3f\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
-                        ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
-                        ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
-                        ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
-                        ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
-                        ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
-                        "cbz %[blocks], 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        "b.eq 5f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        "b.eq 6f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z14.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z8.h, #0\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        "8:\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        "b 7f\n"
-                        "6:\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z10.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z12.h, #0\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        "9:\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        "b 7f\n"
-                        "5:\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z14.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z8.h, #0\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        "10:\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        "b 7f\n"
-                        "4:\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z10.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z12.h, #0\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        "11:\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        "b 7f\n"
-                        "3:\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        "cbz %[blocks], 12f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
-                        ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
-                        ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
-                        ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
-                        "b.eq 13f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
-                        ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
-                        ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
-                        ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
-                        "b.eq 14f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
-                        ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
-                        ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
-                        ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z14.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z8.h, #0\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        "15:\n"
-                        ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
-                        ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
-                        ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
-                        ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
-                        "b 7f\n"
-                        "14:\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z10.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z12.h, #0\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        "16:\n"
-                        ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
-                        ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
-                        ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
-                        ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
-                        "b 7f\n"
-                        "13:\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z14.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z8.h, #0\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        "17:\n"
-                        ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
-                        ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
-                        ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
-                        ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
-                        "b 7f\n"
-                        "12:\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z10.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z12.h, #0\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        "18:\n"
-                        ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
-                        ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
-                        ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
-                        ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
-                        "7:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
-                    );
-                    break;
-                case 2:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "whilelt p4.h, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "ld1w z16.s, p0/z, [%[biasptr]]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "mov z20.d, z16.d\n"
-                        "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "mov z21.d, z17.d\n"
-                        "whilelt p5.h, %[temp], %[width]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "mov z22.d, z18.d\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "mov z23.d, z19.d\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
-                        ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
-                        ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
-                        ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
-                        ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
-                        ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
-                        ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
-                        ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
-                        ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "cbz %[regs], 3f\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
-                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
-                        ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
-                        ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
-                        ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
-                        ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
-                        ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
-                        ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
-                        ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
-                        ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
-                        ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
-                        ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
-                        ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
-                        ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
-                        ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
-                        ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
-                        ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
-                        ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
-                        "cbz %[blocks], 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
-                        "b.eq 5f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
-                        "b.eq 6f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z14.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z8.h, #0\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        "8:\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
-                        "b 7f\n"
-                        "6:\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z10.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z12.h, #0\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        "9:\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
-                        "b 7f\n"
-                        "5:\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z14.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z8.h, #0\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        "10:\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
-                        "b 7f\n"
-                        "4:\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z10.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z12.h, #0\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        "11:\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
-                        "b 7f\n"
-                        "3:\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
-                        ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
-                        ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
-                        "cbz %[blocks], 12f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
-                        ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
-                        ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
-                        ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
-                        ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
-                        ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
-                        ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
-                        ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
-                        "b.eq 13f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
-                        ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
-                        ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
-                        ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
-                        ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
-                        ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
-                        ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
-                        ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
-                        "b.eq 14f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
-                        ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
-                        ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
-                        ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
-                        ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
-                        ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
-                        ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
-                        ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z14.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z8.h, #0\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        "15:\n"
-                        ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
-                        ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
-                        ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
-                        ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
-                        ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
-                        ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
-                        ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
-                        ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
-                        "b 7f\n"
-                        "14:\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z10.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z12.h, #0\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        "16:\n"
-                        ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
-                        ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
-                        ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
-                        ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
-                        ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
-                        ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
-                        ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
-                        ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
-                        "b 7f\n"
-                        "13:\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z14.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z8.h, #0\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        "17:\n"
-                        ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
-                        ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
-                        ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
-                        ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
-                        ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
-                        ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
-                        ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
-                        ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
-                        "b 7f\n"
-                        "12:\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z10.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z12.h, #0\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        "18:\n"
-                        ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
-                        ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
-                        ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
-                        ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
-                        ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
-                        ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
-                        ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
-                        ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
-                        "7:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "fmax z20.s, p7/m, z20.s, z14.s\n"
-                        "fmax z21.s, p7/m, z21.s, z14.s\n"
-                        "fmax z22.s, p7/m, z22.s, z14.s\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmax z23.s, p7/m, z23.s, z14.s\n"
-                        "fmin z20.s, p7/m, z20.s, z15.s\n"
-                        "fmin z21.s, p7/m, z21.s, z15.s\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "fmin z22.s, p7/m, z22.s, z15.s\n"
-                        "fmin z23.s, p7/m, z23.s, z15.s\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "st1w z20.s, p0, [c_ptr1]\n"
-                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
-                    );
-                    break;
-                case 3:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "whilelt p4.h, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "ld1w z16.s, p0/z, [%[biasptr]]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "mov z20.d, z16.d\n"
-                        "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "mov z24.d, z16.d\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "mov z21.d, z17.d\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        "mov z25.d, z17.d\n"
-                        "whilelt p5.h, %[temp], %[width]\n"
-                        "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "mov z22.d, z18.d\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "mov z26.d, z18.d\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "mov z23.d, z19.d\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z27.d, z19.d\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
-                        ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
-                        ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
-                        ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
-                        ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
-                        ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
-                        ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
-                        ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
-                        ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
-                        ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
-                        ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
-                        ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
-                        ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
-                        ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
-                        ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
-                        ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
-                        ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
-                        ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
-                        ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
-                        ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
-                        ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
-                        ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
-                        ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
-                        ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
-                        ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
-                        ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
-                        ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
-                        ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
-                        ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
-                        ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
-                        ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
-                        ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
-                        ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
-                        ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
-                        ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
-                        ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
-                        ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "cbz %[regs], 3f\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
-                        ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
-                        ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
-                        ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
-                        ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
-                        ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
-                        ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
-                        ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
-                        ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
-                        ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
-                        ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
-                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
-                        ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
-                        ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
-                        ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
-                        ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
-                        ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
-                        ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
-                        ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
-                        ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
-                        ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
-                        ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
-                        ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
-                        ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
-                        ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
-                        ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
-                        ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
-                        ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
-                        ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n"
-                        ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
-                        ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
-                        ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n"
-                        ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
-                        ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
-                        ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n"
-                        ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
-                        ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
-                        ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n"
-                        ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
-                        ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
-                        ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n"
-                        ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
-                        ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
-                        ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n"
-                        "cbz %[blocks], 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
-                        ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
-                        ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
-                        ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
-                        ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
-                        "b.eq 5f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
-                        ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
-                        ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
-                        ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
-                        ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
-                        "b.eq 6f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
-                        ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
-                        ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
-                        ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
-                        ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z14.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z8.h, #0\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        "8:\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
-                        ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
-                        ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
-                        ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
-                        ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
-                        "b 7f\n"
-                        "6:\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z10.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z12.h, #0\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        "9:\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
-                        ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
-                        ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
-                        ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
-                        ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
-                        "b 7f\n"
-                        "5:\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z14.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z8.h, #0\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        "10:\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
-                        ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
-                        ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
-                        ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
-                        ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
-                        "b 7f\n"
-                        "4:\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z10.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z12.h, #0\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        "11:\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
-                        ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
-                        ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
-                        ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
-                        ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
-                        "b 7f\n"
-                        "3:\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
-                        ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
-                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
-                        ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        "ld1rqh z6.h, p6/z, [a_ptr2]\n"
-                        ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
-                        ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
-                        ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
-                        ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
-                        ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
-                        ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
-                        ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
-                        ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
-                        ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
-                        ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
-                        ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
-                        ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
-                        ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
-                        "cbz %[blocks], 12f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
-                        ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
-                        ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n"
-                        ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
-                        ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
-                        ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n"
-                        ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
-                        ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
-                        ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n"
-                        ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
-                        ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
-                        ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n"
-                        "b.eq 13f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
-                        ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
-                        ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n"
-                        ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
-                        ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
-                        ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n"
-                        ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
-                        ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
-                        ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n"
-                        ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
-                        ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
-                        ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n"
-                        "b.eq 14f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
-                        ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
-                        ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n"
-                        ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
-                        ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
-                        ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n"
-                        ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
-                        ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
-                        ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n"
-                        ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
-                        ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
-                        ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z14.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z8.h, #0\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        "15:\n"
-                        ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
-                        ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
-                        ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n"
-                        ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
-                        ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
-                        ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n"
-                        ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
-                        ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
-                        ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n"
-                        ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
-                        ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
-                        ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n"
-                        "b 7f\n"
-                        "14:\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z10.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z12.h, #0\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        "16:\n"
-                        ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
-                        ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
-                        ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n"
-                        ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
-                        ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
-                        ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n"
-                        ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
-                        ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
-                        ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n"
-                        ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
-                        ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
-                        ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n"
-                        "b 7f\n"
-                        "13:\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z14.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z8.h, #0\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        "17:\n"
-                        ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
-                        ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
-                        ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n"
-                        ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
-                        ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
-                        ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n"
-                        ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
-                        ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
-                        ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n"
-                        ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
-                        ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
-                        ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n"
-                        "b 7f\n"
-                        "12:\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z10.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z12.h, #0\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        "18:\n"
-                        ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
-                        ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
-                        ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n"
-                        ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
-                        ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
-                        ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n"
-                        ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
-                        ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
-                        ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n"
-                        ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
-                        ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
-                        ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n"
-                        "7:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "fmax z20.s, p7/m, z20.s, z14.s\n"
-                        "fmax z21.s, p7/m, z21.s, z14.s\n"
-                        "fmax z22.s, p7/m, z22.s, z14.s\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmax z23.s, p7/m, z23.s, z14.s\n"
-                        "fmin z20.s, p7/m, z20.s, z15.s\n"
-                        "fmin z21.s, p7/m, z21.s, z15.s\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "fmin z22.s, p7/m, z22.s, z15.s\n"
-                        "fmin z23.s, p7/m, z23.s, z15.s\n"
-                        "fmax z24.s, p7/m, z24.s, z14.s\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "fmax z25.s, p7/m, z25.s, z14.s\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "fmax z26.s, p7/m, z26.s, z14.s\n"
-                        "st1w z20.s, p0, [c_ptr1]\n"
-                        "fmin z24.s, p7/m, z24.s, z15.s\n"
-                        "fmin z25.s, p7/m, z25.s, z15.s\n"
-                        "fmax z27.s, p7/m, z27.s, z14.s\n"
-                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "fmin z26.s, p7/m, z26.s, z15.s\n"
-                        "fmin z27.s, p7/m, z27.s, z15.s\n"
-                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
-                        "st1w z24.s, p0, [c_ptr2]\n"
-                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
-                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
-                    );
-                    break;
-                default:
-                case 4:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "whilelt p4.h, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "ld1w z16.s, p0/z, [%[biasptr]]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "mov z20.d, z16.d\n"
-                        "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "mov z24.d, z16.d\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "mov z28.d, z16.d\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3]\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "mov z21.d, z17.d\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        "mov z25.d, z17.d\n"
-                        "whilelt p5.h, %[temp], %[width]\n"
-                        "mov z29.d, z17.d\n"
-                        "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "incw %[temp], all, mul #1\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "mov z22.d, z18.d\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "mov z26.d, z18.d\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "mov z30.d, z18.d\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "mov z23.d, z19.d\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "mov z27.d, z19.d\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z31.d, z19.d\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
-                        ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n"
-                        "ld1rqh z7.h, p7/z, [a_ptr3]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        ".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
-                        ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        ".inst 0x6463417f // bfdot z31.s, z11.h, z3.h[0]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
-                        ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
-                        ".inst 0x646b419c // bfdot z28.s, z12.h, z3.h[1]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
-                        ".inst 0x646b41bd // bfdot z29.s, z13.h, z3.h[1]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
-                        ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
-                        ".inst 0x646b41de // bfdot z30.s, z14.h, z3.h[1]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
-                        ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        ".inst 0x646b41ff // bfdot z31.s, z15.h, z3.h[1]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
-                        ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
-                        ".inst 0x6473411c // bfdot z28.s, z8.h, z3.h[2]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
-                        ".inst 0x6473413d // bfdot z29.s, z9.h, z3.h[2]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
-                        ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
-                        ".inst 0x6473415e // bfdot z30.s, z10.h, z3.h[2]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
-                        ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        ".inst 0x6473417f // bfdot z31.s, z11.h, z3.h[2]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
-                        ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
-                        ".inst 0x647b419c // bfdot z28.s, z12.h, z3.h[3]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
-                        ".inst 0x647b41bd // bfdot z29.s, z13.h, z3.h[3]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
-                        ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
-                        ".inst 0x647b41de // bfdot z30.s, z14.h, z3.h[3]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
-                        ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        ".inst 0x647b41ff // bfdot z31.s, z15.h, z3.h[3]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
-                        ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n"
-                        ".inst 0x6467411c // bfdot z28.s, z8.h, z7.h[0]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n"
-                        ".inst 0x6467413d // bfdot z29.s, z9.h, z7.h[0]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
-                        ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
-                        ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n"
-                        ".inst 0x6467415e // bfdot z30.s, z10.h, z7.h[0]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
-                        ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
-                        ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        ".inst 0x6467417f // bfdot z31.s, z11.h, z7.h[0]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
-                        ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n"
-                        ".inst 0x646f419c // bfdot z28.s, z12.h, z7.h[1]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n"
-                        ".inst 0x646f41bd // bfdot z29.s, z13.h, z7.h[1]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
-                        ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
-                        ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n"
-                        ".inst 0x646f41de // bfdot z30.s, z14.h, z7.h[1]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
-                        ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
-                        ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        ".inst 0x646f41ff // bfdot z31.s, z15.h, z7.h[1]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
-                        ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n"
-                        ".inst 0x6477411c // bfdot z28.s, z8.h, z7.h[2]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n"
-                        ".inst 0x6477413d // bfdot z29.s, z9.h, z7.h[2]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
-                        ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
-                        ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n"
-                        ".inst 0x6477415e // bfdot z30.s, z10.h, z7.h[2]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
-                        ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
-                        ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        ".inst 0x6477417f // bfdot z31.s, z11.h, z7.h[2]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
-                        ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n"
-                        ".inst 0x647f419c // bfdot z28.s, z12.h, z7.h[3]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n"
-                        ".inst 0x647f41bd // bfdot z29.s, z13.h, z7.h[3]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
-                        ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
-                        ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n"
-                        ".inst 0x647f41de // bfdot z30.s, z14.h, z7.h[3]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
-                        ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
-                        ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        ".inst 0x647f41ff // bfdot z31.s, z15.h, z7.h[3]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "cbz %[regs], 3f\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
-                        ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n"
-                        "ld1rqh z7.h, p7/z, [a_ptr3]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
-                        ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
-                        ".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
-                        ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        ".inst 0x6463417f // bfdot z31.s, z11.h, z3.h[0]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
-                        ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
-                        ".inst 0x646b419c // bfdot z28.s, z12.h, z3.h[1]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
-                        ".inst 0x646b41bd // bfdot z29.s, z13.h, z3.h[1]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
-                        ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
-                        ".inst 0x646b41de // bfdot z30.s, z14.h, z3.h[1]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
-                        ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        ".inst 0x646b41ff // bfdot z31.s, z15.h, z3.h[1]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
-                        ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
-                        ".inst 0x6473411c // bfdot z28.s, z8.h, z3.h[2]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
-                        ".inst 0x6473413d // bfdot z29.s, z9.h, z3.h[2]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
-                        ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
-                        ".inst 0x6473415e // bfdot z30.s, z10.h, z3.h[2]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
-                        ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        ".inst 0x6473417f // bfdot z31.s, z11.h, z3.h[2]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
-                        ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
-                        ".inst 0x647b419c // bfdot z28.s, z12.h, z3.h[3]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
-                        ".inst 0x647b41bd // bfdot z29.s, z13.h, z3.h[3]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
-                        ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
-                        ".inst 0x647b41de // bfdot z30.s, z14.h, z3.h[3]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
-                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
-                        ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
-                        "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        ".inst 0x647b41ff // bfdot z31.s, z15.h, z3.h[3]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
-                        "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
-                        ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        ".inst 0x6467411c // bfdot z28.s, z8.h, z7.h[0]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
-                        "addvl a_ptr3, a_ptr3, #2\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n"
-                        ".inst 0x6467413d // bfdot z29.s, z9.h, z7.h[0]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
-                        ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
-                        ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n"
-                        ".inst 0x6467415e // bfdot z30.s, z10.h, z7.h[0]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
-                        ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
-                        ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        ".inst 0x6467417f // bfdot z31.s, z11.h, z7.h[0]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
-                        ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n"
-                        ".inst 0x646f419c // bfdot z28.s, z12.h, z7.h[1]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n"
-                        ".inst 0x646f41bd // bfdot z29.s, z13.h, z7.h[1]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
-                        ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
-                        ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n"
-                        ".inst 0x646f41de // bfdot z30.s, z14.h, z7.h[1]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
-                        ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
-                        ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        ".inst 0x646f41ff // bfdot z31.s, z15.h, z7.h[1]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
-                        ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
-                        ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n"
-                        ".inst 0x6477411c // bfdot z28.s, z8.h, z7.h[2]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
-                        ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
-                        ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x6477413d // bfdot z29.s, z9.h, z7.h[2]\n"
-                        ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
-                        ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
-                        ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n"
-                        ".inst 0x6477415e // bfdot z30.s, z10.h, z7.h[2]\n"
-                        ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
-                        ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
-                        ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n"
-                        ".inst 0x6477417f // bfdot z31.s, z11.h, z7.h[2]\n"
-                        ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
-                        ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
-                        ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n"
-                        ".inst 0x647f419c // bfdot z28.s, z12.h, z7.h[3]\n"
-                        ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
-                        ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
-                        ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n"
-                        ".inst 0x647f41bd // bfdot z29.s, z13.h, z7.h[3]\n"
-                        ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
-                        ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
-                        ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n"
-                        ".inst 0x647f41de // bfdot z30.s, z14.h, z7.h[3]\n"
-                        ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
-                        ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
-                        ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n"
-                        ".inst 0x647f41ff // bfdot z31.s, z15.h, z7.h[3]\n"
-                        "cbz %[blocks], 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
-                        ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
-                        ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
-                        ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
-                        ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
-                        ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
-                        ".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
-                        ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
-                        ".inst 0x6463417f // bfdot z31.s, z11.h, z3.h[0]\n"
-                        "b.eq 5f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
-                        ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
-                        ".inst 0x646b419c // bfdot z28.s, z12.h, z3.h[1]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
-                        ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
-                        ".inst 0x646b41bd // bfdot z29.s, z13.h, z3.h[1]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
-                        ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
-                        ".inst 0x646b41de // bfdot z30.s, z14.h, z3.h[1]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
-                        ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
-                        ".inst 0x646b41ff // bfdot z31.s, z15.h, z3.h[1]\n"
-                        "b.eq 6f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
-                        ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
-                        ".inst 0x6473411c // bfdot z28.s, z8.h, z3.h[2]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
-                        ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
-                        ".inst 0x6473413d // bfdot z29.s, z9.h, z3.h[2]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
-                        ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
-                        ".inst 0x6473415e // bfdot z30.s, z10.h, z3.h[2]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
-                        ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
-                        ".inst 0x6473417f // bfdot z31.s, z11.h, z3.h[2]\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z14.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z8.h, #0\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        "8:\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
-                        ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
-                        ".inst 0x647b419c // bfdot z28.s, z12.h, z3.h[3]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
-                        ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
-                        ".inst 0x647b41bd // bfdot z29.s, z13.h, z3.h[3]\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
-                        ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
-                        ".inst 0x647b41de // bfdot z30.s, z14.h, z3.h[3]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
-                        ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
-                        ".inst 0x647b41ff // bfdot z31.s, z15.h, z3.h[3]\n"
-                        "b 7f\n"
-                        "6:\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z10.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z12.h, #0\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        "9:\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
-                        ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
-                        ".inst 0x6473411c // bfdot z28.s, z8.h, z3.h[2]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
-                        ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
-                        ".inst 0x6473413d // bfdot z29.s, z9.h, z3.h[2]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
-                        ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
-                        ".inst 0x6473415e // bfdot z30.s, z10.h, z3.h[2]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
-                        ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
-                        ".inst 0x6473417f // bfdot z31.s, z11.h, z3.h[2]\n"
-                        "b 7f\n"
-                        "5:\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z14.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z8.h, #0\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        "10:\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
-                        ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
-                        ".inst 0x646b419c // bfdot z28.s, z12.h, z3.h[1]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
-                        ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
-                        ".inst 0x646b41bd // bfdot z29.s, z13.h, z3.h[1]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
-                        ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
-                        ".inst 0x646b41de // bfdot z30.s, z14.h, z3.h[1]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
-                        ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
-                        ".inst 0x646b41ff // bfdot z31.s, z15.h, z3.h[1]\n"
-                        "b 7f\n"
-                        "4:\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z10.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z12.h, #0\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        "11:\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
-                        ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
-                        ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
-                        ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
-                        ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
-                        ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
-                        ".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
-                        ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
-                        ".inst 0x6463417f // bfdot z31.s, z11.h, z3.h[0]\n"
-                        "b 7f\n"
-                        "3:\n"
-                        ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
-                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
-                        ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
-                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
-                        ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
-                        "ld1rqh z6.h, p6/z, [a_ptr2]\n"
-                        ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
-                        "ld1rqh z7.h, p6/z, [a_ptr3]\n"
-                        ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
-                        "addvl a_ptr3, a_ptr3, #1\n"
-                        ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
-                        ".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
-                        ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
-                        ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        ".inst 0x6463417f // bfdot z31.s, z11.h, z3.h[0]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
-                        ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
-                        ".inst 0x646b419c // bfdot z28.s, z12.h, z3.h[1]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
-                        ".inst 0x646b41bd // bfdot z29.s, z13.h, z3.h[1]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
-                        ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
-                        ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
-                        ".inst 0x646b41de // bfdot z30.s, z14.h, z3.h[1]\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
-                        ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
-                        ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        ".inst 0x646b41ff // bfdot z31.s, z15.h, z3.h[1]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
-                        ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
-                        ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
-                        ".inst 0x6473411c // bfdot z28.s, z8.h, z3.h[2]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
-                        ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
-                        ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x6473413d // bfdot z29.s, z9.h, z3.h[2]\n"
-                        ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
-                        ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
-                        ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
-                        ".inst 0x6473415e // bfdot z30.s, z10.h, z3.h[2]\n"
-                        ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
-                        ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
-                        ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
-                        ".inst 0x6473417f // bfdot z31.s, z11.h, z3.h[2]\n"
-                        ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
-                        ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
-                        ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
-                        ".inst 0x647b419c // bfdot z28.s, z12.h, z3.h[3]\n"
-                        ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
-                        ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
-                        ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
-                        ".inst 0x647b41bd // bfdot z29.s, z13.h, z3.h[3]\n"
-                        ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
-                        ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
-                        ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
-                        ".inst 0x647b41de // bfdot z30.s, z14.h, z3.h[3]\n"
-                        ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
-                        ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
-                        ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
-                        ".inst 0x647b41ff // bfdot z31.s, z15.h, z3.h[3]\n"
-                        "cbz %[blocks], 12f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
-                        ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
-                        ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n"
-                        ".inst 0x6467411c // bfdot z28.s, z8.h, z7.h[0]\n"
-                        ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
-                        ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
-                        ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n"
-                        ".inst 0x6467413d // bfdot z29.s, z9.h, z7.h[0]\n"
-                        ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
-                        ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
-                        ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n"
-                        ".inst 0x6467415e // bfdot z30.s, z10.h, z7.h[0]\n"
-                        ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
-                        ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
-                        ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n"
-                        ".inst 0x6467417f // bfdot z31.s, z11.h, z7.h[0]\n"
-                        "b.eq 13f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z14.h, p4/z, [%[b_ptr1]]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z8.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
-                        ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
-                        ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n"
-                        ".inst 0x646f419c // bfdot z28.s, z12.h, z7.h[1]\n"
-                        ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
-                        ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
-                        ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n"
-                        ".inst 0x646f41bd // bfdot z29.s, z13.h, z7.h[1]\n"
-                        ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
-                        ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
-                        ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n"
-                        ".inst 0x646f41de // bfdot z30.s, z14.h, z7.h[1]\n"
-                        ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
-                        ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
-                        ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n"
-                        ".inst 0x646f41ff // bfdot z31.s, z15.h, z7.h[1]\n"
-                        "b.eq 14f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z10.h, p4/z, [%[b_ptr1]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z12.h, p5/z, [%[b_ptr1], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
-                        ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
-                        ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n"
-                        ".inst 0x6477411c // bfdot z28.s, z8.h, z7.h[2]\n"
-                        ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
-                        ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
-                        ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n"
-                        ".inst 0x6477413d // bfdot z29.s, z9.h, z7.h[2]\n"
-                        ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
-                        ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
-                        ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n"
-                        ".inst 0x6477415e // bfdot z30.s, z10.h, z7.h[2]\n"
-                        ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
-                        ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
-                        ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n"
-                        ".inst 0x6477417f // bfdot z31.s, z11.h, z7.h[2]\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z14.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z8.h, #0\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        "15:\n"
-                        ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
-                        ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
-                        ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n"
-                        ".inst 0x647f419c // bfdot z28.s, z12.h, z7.h[3]\n"
-                        ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
-                        ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
-                        ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n"
-                        ".inst 0x647f41bd // bfdot z29.s, z13.h, z7.h[3]\n"
-                        ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
-                        ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
-                        ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n"
-                        ".inst 0x647f41de // bfdot z30.s, z14.h, z7.h[3]\n"
-                        ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
-                        ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
-                        ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n"
-                        ".inst 0x647f41ff // bfdot z31.s, z15.h, z7.h[3]\n"
-                        "b 7f\n"
-                        "14:\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z10.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z12.h, #0\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        "16:\n"
-                        ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
-                        ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
-                        ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n"
-                        ".inst 0x6477411c // bfdot z28.s, z8.h, z7.h[2]\n"
-                        ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
-                        ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
-                        ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n"
-                        ".inst 0x6477413d // bfdot z29.s, z9.h, z7.h[2]\n"
-                        ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
-                        ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
-                        ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n"
-                        ".inst 0x6477415e // bfdot z30.s, z10.h, z7.h[2]\n"
-                        ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
-                        ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
-                        ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n"
-                        ".inst 0x6477417f // bfdot z31.s, z11.h, z7.h[2]\n"
-                        "b 7f\n"
-                        "13:\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z14.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z8.h, #0\n"
-                        "ld1h z13.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z15.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z12.h, z13.h, z14.h\n"
-                        "zip2 z13.h, z13.h, z14.h\n"
-                        "zip1 z14.h, z15.h, z8.h\n"
-                        "zip2 z15.h, z15.h, z8.h\n"
-                        "17:\n"
-                        ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
-                        ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
-                        ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n"
-                        ".inst 0x646f419c // bfdot z28.s, z12.h, z7.h[1]\n"
-                        ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
-                        ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
-                        ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n"
-                        ".inst 0x646f41bd // bfdot z29.s, z13.h, z7.h[1]\n"
-                        ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
-                        ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
-                        ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n"
-                        ".inst 0x646f41de // bfdot z30.s, z14.h, z7.h[1]\n"
-                        ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
-                        ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
-                        ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n"
-                        ".inst 0x646f41ff // bfdot z31.s, z15.h, z7.h[1]\n"
-                        "b 7f\n"
-                        "12:\n"
-                        "cbz %[odds], 7f\n"
-                        "mov z10.h, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z12.h, #0\n"
-                        "ld1h z9.h, p4/z, [%[b_ptr0]]\n"
-                        "ld1h z11.h, p5/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "zip1 z8.h, z9.h, z10.h\n"
-                        "zip2 z9.h, z9.h, z10.h\n"
-                        "zip1 z10.h, z11.h, z12.h\n"
-                        "zip2 z11.h, z11.h, z12.h\n"
-                        "18:\n"
-                        ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
-                        ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
-                        ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n"
-                        ".inst 0x6467411c // bfdot z28.s, z8.h, z7.h[0]\n"
-                        ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
-                        ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
-                        ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n"
-                        ".inst 0x6467413d // bfdot z29.s, z9.h, z7.h[0]\n"
-                        ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
-                        ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
-                        ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n"
-                        ".inst 0x6467415e // bfdot z30.s, z10.h, z7.h[0]\n"
-                        ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
-                        ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
-                        ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n"
-                        ".inst 0x6467417f // bfdot z31.s, z11.h, z7.h[0]\n"
-                        "7:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "fmax z20.s, p7/m, z20.s, z14.s\n"
-                        "fmax z21.s, p7/m, z21.s, z14.s\n"
-                        "fmax z22.s, p7/m, z22.s, z14.s\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmax z23.s, p7/m, z23.s, z14.s\n"
-                        "fmin z20.s, p7/m, z20.s, z15.s\n"
-                        "fmin z21.s, p7/m, z21.s, z15.s\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "fmin z22.s, p7/m, z22.s, z15.s\n"
-                        "fmin z23.s, p7/m, z23.s, z15.s\n"
-                        "fmax z24.s, p7/m, z24.s, z14.s\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "fmax z25.s, p7/m, z25.s, z14.s\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "fmax z26.s, p7/m, z26.s, z14.s\n"
-                        "st1w z20.s, p0, [c_ptr1]\n"
-                        "fmin z24.s, p7/m, z24.s, z15.s\n"
-                        "fmin z25.s, p7/m, z25.s, z15.s\n"
-                        "fmax z27.s, p7/m, z27.s, z14.s\n"
-                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "fmin z26.s, p7/m, z26.s, z15.s\n"
-                        "fmax z28.s, p7/m, z28.s, z14.s\n"
-                        "fmax z29.s, p7/m, z29.s, z14.s\n"
-                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        "fmin z27.s, p7/m, z27.s, z15.s\n"
-                        "fmax z30.s, p7/m, z30.s, z14.s\n"
-                        "fmin z28.s, p7/m, z28.s, z15.s\n"
-                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
-                        "fmin z29.s, p7/m, z29.s, z15.s\n"
-                        "fmax z31.s, p7/m, z31.s, z14.s\n"
-                        "fmin z30.s, p7/m, z30.s, z15.s\n"
-                        "st1w z24.s, p0, [c_ptr2]\n"
-                        "fmin z31.s, p7/m, z31.s, z15.s\n"
-                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
-                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
-                        "st1w z28.s, p0, [c_ptr3]\n"
-                        "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
-                        "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
-                        "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
-                    );
-                    break;
-            }
-
-        }
-    }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp
deleted file mode 100644
index 741f200..0000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4.hpp
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
-namespace arm_gemm
-{
-
-// Actual kernel implementations
-void sve_native_fp16_mla_4VLx4(const __fp16 *, int, const __fp16 *, int ldb, __fp16 *, int, int, int, int, const __fp16 *, Activation, bool);
-
-class native_fp16_mla_4VLx4
-{
-public:
-    typedef __fp16 operand_type;
-    typedef __fp16 result_type;
-
-    typedef void (*kern_type)(const __fp16 *, int, const __fp16 *, int ldb, __fp16 *, int, int, int, int, const __fp16 *, Activation, bool);
-
-    /* Kernel blocking parameters */
-    static constexpr unsigned int out_height()
-    {
-        return 4;
-    }
-
-    static unsigned int out_width()
-    {
-        return get_vector_length<__fp16>() * 4;
-    }
-
-    static constexpr unsigned int k_unroll()
-    {
-        return 1;
-    }
-
-    static constexpr bool supports_append()
-    {
-        return false;
-    }
-
-    static constexpr bool supports_bias()
-    {
-        return true;
-    }
-
-    static constexpr bool supports_activation()
-    {
-        return true;
-    }
-
-
-
-    // Default to the generic kernel
-    kern_type kernel=sve_native_fp16_mla_4VLx4;
-
-    native_fp16_mla_4VLx4(const CPUInfo *ci) { UNUSED(ci); }
-};
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp
deleted file mode 100644
index 14dd38b..0000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp
+++ /dev/null
@@ -1,3799 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_native_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, int ldb, __fp16 *C, int ldc, int M, int N, int K, const __fp16 *bias, Activation act, bool append) {
-    const long loops_count = ((K + 8) / 16) - 1;
-    K -= loops_count * 16;
-    const long regs_count = (K / 8) - 1;
-    K -= (regs_count + 1) * 8;
-    const long leftovers = K;
-    __fp16 nullbias[512];
-    if (!append && !bias) {
-        memset(nullbias, 0, (4 * get_vector_length<__fp16>() * sizeof(__fp16)));
-    }
-    __fp16 minval = - static_cast<__fp16>(std::numeric_limits<float>::infinity());
-    __fp16 maxval =   static_cast<__fp16>(std::numeric_limits<float>::infinity());
-    const __fp16 * const minptr = &minval;
-    const __fp16 * const maxptr = &maxval;
-
-    switch(act.type)
-    {
-        default:
-        case Activation::Type::None:
-            break;
-        case Activation::Type::BoundedReLU:
-            maxval = static_cast<__fp16>(act.param1);
-            /* fall through */
-        case Activation::Type::ReLU:
-            minval = 0.0f;
-            break;
-    }
-
-    for (int y=0; y<M; y+=4) {
-        const __fp16 * const a_ptr0_base = A + (y * lda);
-        const unsigned long ldab = lda * sizeof(__fp16);
-
-        __fp16 *c_ptr0 = C + (y * ldc);
-
-        for (int x0=0; x0<N; x0+=(4 * get_vector_length<__fp16>())) {
-            const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<__fp16>()));
-            long loops = loops_count;
-            long regs = regs_count;
-            long temp = 0;
-            long blocks = leftovers;
-            const __fp16 *a_ptr0 = a_ptr0_base;
-            const __fp16 *b_ptr0 = B + x0;
-            long ldbb = ldb * sizeof(__fp16);
-            const unsigned long ldcb = ldc * sizeof(__fp16);
-            const __fp16 *biasptr = bias ? bias+x0 : nullbias;
-
-            switch(M-y) {
-                case 1:
-                    __asm __volatile (
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.h, %[temp], %[width]\n"
-                        "inch %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "ld1h z16.h, p0/z, [%[biasptr]]\n"
-                        "whilelt p1.h, %[temp], %[width]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "inch %[temp], all, mul #1\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1h z17.h, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "whilelt p2.h, %[temp], %[width]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "inch %[temp], all, mul #1\n"
-                        "ld1h z18.h, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "whilelt p3.h, %[temp], %[width]\n"
-                        "ld1h z19.h, p3/z, [%[biasptr], #3, MUL VL]\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[7]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z17.h, z13.h, z0.h[7]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z18.h, z14.h, z0.h[7]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[7]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z16.h, z8.h, z4.h[0]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "fmla z17.h, z9.h, z4.h[0]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z18.h, z10.h, z4.h[0]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[0]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z16.h, z12.h, z4.h[1]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z4.h[1]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[1]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[1]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z17.h, z9.h, z4.h[2]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z18.h, z10.h, z4.h[2]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[2]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z16.h, z12.h, z4.h[3]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z4.h[3]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[3]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[3]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[4]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z17.h, z9.h, z4.h[4]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z18.h, z10.h, z4.h[4]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[4]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z16.h, z12.h, z4.h[5]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z4.h[5]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[5]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[5]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[6]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z17.h, z9.h, z4.h[6]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z18.h, z10.h, z4.h[6]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[6]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z16.h, z12.h, z4.h[7]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z4.h[7]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[7]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[7]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "cbz %[regs], 3f\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z16.h, z12.h, z0.h[7]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[7]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[7]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[7]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[0]\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        "fmla z17.h, z9.h, z4.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z18.h, z10.h, z4.h[0]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z19.h, z11.h, z4.h[0]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "fmla z16.h, z12.h, z4.h[1]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z17.h, z13.h, z4.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z18.h, z14.h, z4.h[1]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z19.h, z15.h, z4.h[1]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z16.h, z8.h, z4.h[2]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[2]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[2]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[2]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z17.h, z13.h, z4.h[3]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z18.h, z14.h, z4.h[3]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[3]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z16.h, z8.h, z4.h[4]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[4]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[4]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[4]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[5]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z17.h, z13.h, z4.h[5]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z18.h, z14.h, z4.h[5]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[5]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z16.h, z8.h, z4.h[6]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[6]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[6]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[6]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[7]\n"
-                        "fmla z17.h, z13.h, z4.h[7]\n"
-                        "fmla z18.h, z14.h, z4.h[7]\n"
-                        "fmla z19.h, z15.h, z4.h[7]\n"
-                        "cbz %[blocks], 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "b 4f\n"
-                        "3:\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "fmla z16.h, z12.h, z0.h[7]\n"
-                        "fmla z17.h, z13.h, z0.h[7]\n"
-                        "fmla z18.h, z14.h, z0.h[7]\n"
-                        "fmla z19.h, z15.h, z0.h[7]\n"
-                        "cbz %[blocks], 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[0]\n"
-                        "fmla z17.h, z9.h, z4.h[0]\n"
-                        "fmla z18.h, z10.h, z4.h[0]\n"
-                        "fmla z19.h, z11.h, z4.h[0]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[1]\n"
-                        "fmla z17.h, z13.h, z4.h[1]\n"
-                        "fmla z18.h, z14.h, z4.h[1]\n"
-                        "fmla z19.h, z15.h, z4.h[1]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[2]\n"
-                        "fmla z17.h, z9.h, z4.h[2]\n"
-                        "fmla z18.h, z10.h, z4.h[2]\n"
-                        "fmla z19.h, z11.h, z4.h[2]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[3]\n"
-                        "fmla z17.h, z13.h, z4.h[3]\n"
-                        "fmla z18.h, z14.h, z4.h[3]\n"
-                        "fmla z19.h, z15.h, z4.h[3]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[4]\n"
-                        "fmla z17.h, z9.h, z4.h[4]\n"
-                        "fmla z18.h, z10.h, z4.h[4]\n"
-                        "fmla z19.h, z11.h, z4.h[4]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[5]\n"
-                        "fmla z17.h, z13.h, z4.h[5]\n"
-                        "fmla z18.h, z14.h, z4.h[5]\n"
-                        "fmla z19.h, z15.h, z4.h[5]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[6]\n"
-                        "fmla z17.h, z9.h, z4.h[6]\n"
-                        "fmla z18.h, z10.h, z4.h[6]\n"
-                        "fmla z19.h, z11.h, z4.h[6]\n"
-                        "4:\n"
-                        "ld1rh z14.h, p7/z, [%[minptr]]\n"
-                        "ld1rh z15.h, p7/z, [%[maxptr]]\n"
-                        "fmax z16.h, p7/m, z16.h, z14.h\n"
-                        "fmax z17.h, p7/m, z17.h, z14.h\n"
-                        "fmax z18.h, p7/m, z18.h, z14.h\n"
-                        "fmax z19.h, p7/m, z19.h, z14.h\n"
-                        "fmin z16.h, p7/m, z16.h, z15.h\n"
-                        "fmin z17.h, p7/m, z17.h, z15.h\n"
-                        "fmin z18.h, p7/m, z18.h, z15.h\n"
-                        "fmin z19.h, p7/m, z19.h, z15.h\n"
-                        "st1h z16.h, p0, [%[c_ptr0]]\n"
-                        "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
-                    );
-                    break;
-                case 2:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.h, %[temp], %[width]\n"
-                        "inch %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "ld1h z16.h, p0/z, [%[biasptr]]\n"
-                        "whilelt p1.h, %[temp], %[width]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "inch %[temp], all, mul #1\n"
-                        "mov z20.d, z16.d\n"
-                        "ld1h z17.h, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "whilelt p2.h, %[temp], %[width]\n"
-                        "mov z21.d, z17.d\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "inch %[temp], all, mul #1\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ld1h z18.h, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "whilelt p3.h, %[temp], %[width]\n"
-                        "mov z22.d, z18.d\n"
-                        "ld1h z19.h, p3/z, [%[biasptr], #3, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z23.d, z19.d\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z20.h, z8.h, z1.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        "fmla z21.h, z9.h, z1.h[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z22.h, z10.h, z1.h[0]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z23.h, z11.h, z1.h[0]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z20.h, z12.h, z1.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z21.h, z13.h, z1.h[1]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla z22.h, z14.h, z1.h[1]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla z23.h, z15.h, z1.h[1]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z1.h[2]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "fmla z21.h, z9.h, z1.h[2]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "fmla z22.h, z10.h, z1.h[2]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "fmla z23.h, z11.h, z1.h[2]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[3]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "fmla z21.h, z13.h, z1.h[3]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "fmla z22.h, z14.h, z1.h[3]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "fmla z23.h, z15.h, z1.h[3]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z1.h[4]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "fmla z21.h, z9.h, z1.h[4]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "fmla z22.h, z10.h, z1.h[4]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "fmla z23.h, z11.h, z1.h[4]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[5]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "fmla z21.h, z13.h, z1.h[5]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "fmla z22.h, z14.h, z1.h[5]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "fmla z23.h, z15.h, z1.h[5]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z1.h[6]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "fmla z21.h, z9.h, z1.h[6]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "fmla z22.h, z10.h, z1.h[6]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "fmla z23.h, z11.h, z1.h[6]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[7]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[7]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[7]\n"
-                        "fmla z21.h, z13.h, z1.h[7]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[7]\n"
-                        "fmla z22.h, z14.h, z1.h[7]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[7]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "fmla z23.h, z15.h, z1.h[7]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[0]\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
-                        "fmla z20.h, z8.h, z5.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z17.h, z9.h, z4.h[0]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z21.h, z9.h, z5.h[0]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[0]\n"
-                        "fmla z22.h, z10.h, z5.h[0]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[0]\n"
-                        "fmla z23.h, z11.h, z5.h[0]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z5.h[1]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z4.h[1]\n"
-                        "fmla z21.h, z13.h, z5.h[1]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[1]\n"
-                        "fmla z22.h, z14.h, z5.h[1]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[1]\n"
-                        "fmla z23.h, z15.h, z5.h[1]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z5.h[2]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[2]\n"
-                        "fmla z21.h, z9.h, z5.h[2]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[2]\n"
-                        "fmla z22.h, z10.h, z5.h[2]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[2]\n"
-                        "fmla z23.h, z11.h, z5.h[2]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z5.h[3]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z4.h[3]\n"
-                        "fmla z21.h, z13.h, z5.h[3]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[3]\n"
-                        "fmla z22.h, z14.h, z5.h[3]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[3]\n"
-                        "fmla z23.h, z15.h, z5.h[3]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[4]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z5.h[4]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[4]\n"
-                        "fmla z21.h, z9.h, z5.h[4]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[4]\n"
-                        "fmla z22.h, z10.h, z5.h[4]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[4]\n"
-                        "fmla z23.h, z11.h, z5.h[4]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[5]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z5.h[5]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z4.h[5]\n"
-                        "fmla z21.h, z13.h, z5.h[5]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[5]\n"
-                        "fmla z22.h, z14.h, z5.h[5]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[5]\n"
-                        "fmla z23.h, z15.h, z5.h[5]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[6]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z5.h[6]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[6]\n"
-                        "fmla z21.h, z9.h, z5.h[6]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[6]\n"
-                        "fmla z22.h, z10.h, z5.h[6]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[6]\n"
-                        "fmla z23.h, z11.h, z5.h[6]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[7]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z5.h[7]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z4.h[7]\n"
-                        "fmla z21.h, z13.h, z5.h[7]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[7]\n"
-                        "fmla z22.h, z14.h, z5.h[7]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[7]\n"
-                        "fmla z23.h, z15.h, z5.h[7]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "cbz %[regs], 3f\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        "fmla z20.h, z8.h, z1.h[0]\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z21.h, z9.h, z1.h[0]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z22.h, z10.h, z1.h[0]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "fmla z23.h, z11.h, z1.h[0]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[1]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "fmla z21.h, z13.h, z1.h[1]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "fmla z22.h, z14.h, z1.h[1]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "fmla z23.h, z15.h, z1.h[1]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z1.h[2]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "fmla z21.h, z9.h, z1.h[2]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "fmla z22.h, z10.h, z1.h[2]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "fmla z23.h, z11.h, z1.h[2]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[3]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "fmla z21.h, z13.h, z1.h[3]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "fmla z22.h, z14.h, z1.h[3]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "fmla z23.h, z15.h, z1.h[3]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z1.h[4]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "fmla z21.h, z9.h, z1.h[4]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "fmla z22.h, z10.h, z1.h[4]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "fmla z23.h, z11.h, z1.h[4]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[5]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "fmla z21.h, z13.h, z1.h[5]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "fmla z22.h, z14.h, z1.h[5]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "fmla z23.h, z15.h, z1.h[5]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z1.h[6]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "fmla z21.h, z9.h, z1.h[6]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "fmla z22.h, z10.h, z1.h[6]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "fmla z23.h, z11.h, z1.h[6]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[7]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[7]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[7]\n"
-                        "fmla z21.h, z13.h, z1.h[7]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[7]\n"
-                        "fmla z22.h, z14.h, z1.h[7]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[7]\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        "fmla z23.h, z15.h, z1.h[7]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[0]\n"
-                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
-                        "fmla z20.h, z8.h, z5.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z17.h, z9.h, z4.h[0]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z21.h, z9.h, z5.h[0]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "fmla z22.h, z10.h, z5.h[0]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[0]\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        "fmla z23.h, z11.h, z5.h[0]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z5.h[1]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z4.h[1]\n"
-                        "fmla z21.h, z13.h, z5.h[1]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[1]\n"
-                        "fmla z22.h, z14.h, z5.h[1]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[1]\n"
-                        "fmla z23.h, z15.h, z5.h[1]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z5.h[2]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[2]\n"
-                        "fmla z21.h, z9.h, z5.h[2]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[2]\n"
-                        "fmla z22.h, z10.h, z5.h[2]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[2]\n"
-                        "fmla z23.h, z11.h, z5.h[2]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z5.h[3]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z4.h[3]\n"
-                        "fmla z21.h, z13.h, z5.h[3]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[3]\n"
-                        "fmla z22.h, z14.h, z5.h[3]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[3]\n"
-                        "fmla z23.h, z15.h, z5.h[3]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[4]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z5.h[4]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[4]\n"
-                        "fmla z21.h, z9.h, z5.h[4]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[4]\n"
-                        "fmla z22.h, z10.h, z5.h[4]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[4]\n"
-                        "fmla z23.h, z11.h, z5.h[4]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[5]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z5.h[5]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z4.h[5]\n"
-                        "fmla z21.h, z13.h, z5.h[5]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[5]\n"
-                        "fmla z22.h, z14.h, z5.h[5]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[5]\n"
-                        "fmla z23.h, z15.h, z5.h[5]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[6]\n"
-                        "fmla z20.h, z8.h, z5.h[6]\n"
-                        "fmla z17.h, z9.h, z4.h[6]\n"
-                        "fmla z21.h, z9.h, z5.h[6]\n"
-                        "fmla z18.h, z10.h, z4.h[6]\n"
-                        "fmla z22.h, z10.h, z5.h[6]\n"
-                        "fmla z19.h, z11.h, z4.h[6]\n"
-                        "fmla z23.h, z11.h, z5.h[6]\n"
-                        "fmla z16.h, z12.h, z4.h[7]\n"
-                        "fmla z20.h, z12.h, z5.h[7]\n"
-                        "fmla z17.h, z13.h, z4.h[7]\n"
-                        "fmla z21.h, z13.h, z5.h[7]\n"
-                        "fmla z18.h, z14.h, z4.h[7]\n"
-                        "fmla z22.h, z14.h, z5.h[7]\n"
-                        "fmla z19.h, z15.h, z4.h[7]\n"
-                        "fmla z23.h, z15.h, z5.h[7]\n"
-                        "cbz %[blocks], 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "fmla z20.h, z8.h, z1.h[0]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "fmla z21.h, z9.h, z1.h[0]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "fmla z22.h, z10.h, z1.h[0]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "fmla z23.h, z11.h, z1.h[0]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "fmla z20.h, z12.h, z1.h[1]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "fmla z21.h, z13.h, z1.h[1]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "fmla z22.h, z14.h, z1.h[1]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "fmla z23.h, z15.h, z1.h[1]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "fmla z20.h, z8.h, z1.h[2]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "fmla z21.h, z9.h, z1.h[2]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "fmla z22.h, z10.h, z1.h[2]\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "fmla z23.h, z11.h, z1.h[2]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "fmla z20.h, z12.h, z1.h[3]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "fmla z21.h, z13.h, z1.h[3]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "fmla z22.h, z14.h, z1.h[3]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "fmla z23.h, z15.h, z1.h[3]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "fmla z20.h, z8.h, z1.h[4]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "fmla z21.h, z9.h, z1.h[4]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "fmla z22.h, z10.h, z1.h[4]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "fmla z23.h, z11.h, z1.h[4]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "fmla z20.h, z12.h, z1.h[5]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "fmla z21.h, z13.h, z1.h[5]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "fmla z22.h, z14.h, z1.h[5]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "fmla z23.h, z15.h, z1.h[5]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "fmla z20.h, z8.h, z1.h[6]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "fmla z21.h, z9.h, z1.h[6]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "fmla z22.h, z10.h, z1.h[6]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "fmla z23.h, z11.h, z1.h[6]\n"
-                        "b 4f\n"
-                        "3:\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
-                        "fmla z20.h, z8.h, z1.h[0]\n"
-                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z21.h, z9.h, z1.h[0]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z22.h, z10.h, z1.h[0]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "fmla z23.h, z11.h, z1.h[0]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[1]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        "fmla z21.h, z13.h, z1.h[1]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "fmla z22.h, z14.h, z1.h[1]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "fmla z23.h, z15.h, z1.h[1]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z1.h[2]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "fmla z21.h, z9.h, z1.h[2]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "fmla z22.h, z10.h, z1.h[2]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "fmla z23.h, z11.h, z1.h[2]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[3]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "fmla z21.h, z13.h, z1.h[3]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "fmla z22.h, z14.h, z1.h[3]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "fmla z23.h, z15.h, z1.h[3]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z1.h[4]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "fmla z21.h, z9.h, z1.h[4]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "fmla z22.h, z10.h, z1.h[4]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "fmla z23.h, z11.h, z1.h[4]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[5]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "fmla z21.h, z13.h, z1.h[5]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "fmla z22.h, z14.h, z1.h[5]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "fmla z23.h, z15.h, z1.h[5]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "fmla z20.h, z8.h, z1.h[6]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "fmla z21.h, z9.h, z1.h[6]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "fmla z22.h, z10.h, z1.h[6]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "fmla z23.h, z11.h, z1.h[6]\n"
-                        "fmla z16.h, z12.h, z0.h[7]\n"
-                        "fmla z20.h, z12.h, z1.h[7]\n"
-                        "fmla z17.h, z13.h, z0.h[7]\n"
-                        "fmla z21.h, z13.h, z1.h[7]\n"
-                        "fmla z18.h, z14.h, z0.h[7]\n"
-                        "fmla z22.h, z14.h, z1.h[7]\n"
-                        "fmla z19.h, z15.h, z0.h[7]\n"
-                        "fmla z23.h, z15.h, z1.h[7]\n"
-                        "cbz %[blocks], 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[0]\n"
-                        "fmla z20.h, z8.h, z5.h[0]\n"
-                        "fmla z17.h, z9.h, z4.h[0]\n"
-                        "fmla z21.h, z9.h, z5.h[0]\n"
-                        "fmla z18.h, z10.h, z4.h[0]\n"
-                        "fmla z22.h, z10.h, z5.h[0]\n"
-                        "fmla z19.h, z11.h, z4.h[0]\n"
-                        "fmla z23.h, z11.h, z5.h[0]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[1]\n"
-                        "fmla z20.h, z12.h, z5.h[1]\n"
-                        "fmla z17.h, z13.h, z4.h[1]\n"
-                        "fmla z21.h, z13.h, z5.h[1]\n"
-                        "fmla z18.h, z14.h, z4.h[1]\n"
-                        "fmla z22.h, z14.h, z5.h[1]\n"
-                        "fmla z19.h, z15.h, z4.h[1]\n"
-                        "fmla z23.h, z15.h, z5.h[1]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[2]\n"
-                        "fmla z20.h, z8.h, z5.h[2]\n"
-                        "fmla z17.h, z9.h, z4.h[2]\n"
-                        "fmla z21.h, z9.h, z5.h[2]\n"
-                        "fmla z18.h, z10.h, z4.h[2]\n"
-                        "fmla z22.h, z10.h, z5.h[2]\n"
-                        "fmla z19.h, z11.h, z4.h[2]\n"
-                        "fmla z23.h, z11.h, z5.h[2]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[3]\n"
-                        "fmla z20.h, z12.h, z5.h[3]\n"
-                        "fmla z17.h, z13.h, z4.h[3]\n"
-                        "fmla z21.h, z13.h, z5.h[3]\n"
-                        "fmla z18.h, z14.h, z4.h[3]\n"
-                        "fmla z22.h, z14.h, z5.h[3]\n"
-                        "fmla z19.h, z15.h, z4.h[3]\n"
-                        "fmla z23.h, z15.h, z5.h[3]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[4]\n"
-                        "fmla z20.h, z8.h, z5.h[4]\n"
-                        "fmla z17.h, z9.h, z4.h[4]\n"
-                        "fmla z21.h, z9.h, z5.h[4]\n"
-                        "fmla z18.h, z10.h, z4.h[4]\n"
-                        "fmla z22.h, z10.h, z5.h[4]\n"
-                        "fmla z19.h, z11.h, z4.h[4]\n"
-                        "fmla z23.h, z11.h, z5.h[4]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[5]\n"
-                        "fmla z20.h, z12.h, z5.h[5]\n"
-                        "fmla z17.h, z13.h, z4.h[5]\n"
-                        "fmla z21.h, z13.h, z5.h[5]\n"
-                        "fmla z18.h, z14.h, z4.h[5]\n"
-                        "fmla z22.h, z14.h, z5.h[5]\n"
-                        "fmla z19.h, z15.h, z4.h[5]\n"
-                        "fmla z23.h, z15.h, z5.h[5]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[6]\n"
-                        "fmla z20.h, z8.h, z5.h[6]\n"
-                        "fmla z17.h, z9.h, z4.h[6]\n"
-                        "fmla z21.h, z9.h, z5.h[6]\n"
-                        "fmla z18.h, z10.h, z4.h[6]\n"
-                        "fmla z22.h, z10.h, z5.h[6]\n"
-                        "fmla z19.h, z11.h, z4.h[6]\n"
-                        "fmla z23.h, z11.h, z5.h[6]\n"
-                        "4:\n"
-                        "ld1rh z14.h, p7/z, [%[minptr]]\n"
-                        "ld1rh z15.h, p7/z, [%[maxptr]]\n"
-                        "fmax z16.h, p7/m, z16.h, z14.h\n"
-                        "fmax z17.h, p7/m, z17.h, z14.h\n"
-                        "fmax z18.h, p7/m, z18.h, z14.h\n"
-                        "fmax z19.h, p7/m, z19.h, z14.h\n"
-                        "fmin z16.h, p7/m, z16.h, z15.h\n"
-                        "fmin z17.h, p7/m, z17.h, z15.h\n"
-                        "fmin z18.h, p7/m, z18.h, z15.h\n"
-                        "fmin z19.h, p7/m, z19.h, z15.h\n"
-                        "st1h z16.h, p0, [%[c_ptr0]]\n"
-                        "fmax z20.h, p7/m, z20.h, z14.h\n"
-                        "fmax z21.h, p7/m, z21.h, z14.h\n"
-                        "fmax z22.h, p7/m, z22.h, z14.h\n"
-                        "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmax z23.h, p7/m, z23.h, z14.h\n"
-                        "fmin z20.h, p7/m, z20.h, z15.h\n"
-                        "fmin z21.h, p7/m, z21.h, z15.h\n"
-                        "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "fmin z22.h, p7/m, z22.h, z15.h\n"
-                        "fmin z23.h, p7/m, z23.h, z15.h\n"
-                        "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "st1h z20.h, p0, [c_ptr1]\n"
-                        "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
-                        "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
-                        "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
-                    );
-                    break;
-                case 3:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.h, %[temp], %[width]\n"
-                        "inch %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "ld1h z16.h, p0/z, [%[biasptr]]\n"
-                        "whilelt p1.h, %[temp], %[width]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "inch %[temp], all, mul #1\n"
-                        "mov z20.d, z16.d\n"
-                        "ld1h z17.h, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "mov z24.d, z16.d\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "whilelt p2.h, %[temp], %[width]\n"
-                        "mov z21.d, z17.d\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "mov z25.d, z17.d\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "inch %[temp], all, mul #1\n"
-                        "ld1h z18.h, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "whilelt p3.h, %[temp], %[width]\n"
-                        "mov z22.d, z18.d\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "mov z26.d, z18.d\n"
-                        "ld1h z19.h, p3/z, [%[biasptr], #3, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z23.d, z19.d\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "mov z27.d, z19.d\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z20.h, z8.h, z1.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        "fmla z24.h, z8.h, z2.h[0]\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
-                        "fmla z21.h, z9.h, z1.h[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla z25.h, z9.h, z2.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z22.h, z10.h, z1.h[0]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z26.h, z10.h, z2.h[0]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla z23.h, z11.h, z1.h[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla z27.h, z11.h, z2.h[0]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "fmla z20.h, z12.h, z1.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z24.h, z12.h, z2.h[1]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "fmla z21.h, z13.h, z1.h[1]\n"
-                        "fmla z25.h, z13.h, z2.h[1]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "fmla z22.h, z14.h, z1.h[1]\n"
-                        "fmla z26.h, z14.h, z2.h[1]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "fmla z23.h, z15.h, z1.h[1]\n"
-                        "fmla z27.h, z15.h, z2.h[1]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z1.h[2]\n"
-                        "fmla z24.h, z8.h, z2.h[2]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "fmla z21.h, z9.h, z1.h[2]\n"
-                        "fmla z25.h, z9.h, z2.h[2]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "fmla z22.h, z10.h, z1.h[2]\n"
-                        "fmla z26.h, z10.h, z2.h[2]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "fmla z23.h, z11.h, z1.h[2]\n"
-                        "fmla z27.h, z11.h, z2.h[2]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[3]\n"
-                        "fmla z24.h, z12.h, z2.h[3]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "fmla z21.h, z13.h, z1.h[3]\n"
-                        "fmla z25.h, z13.h, z2.h[3]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "fmla z22.h, z14.h, z1.h[3]\n"
-                        "fmla z26.h, z14.h, z2.h[3]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "fmla z23.h, z15.h, z1.h[3]\n"
-                        "fmla z27.h, z15.h, z2.h[3]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z1.h[4]\n"
-                        "fmla z24.h, z8.h, z2.h[4]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "fmla z21.h, z9.h, z1.h[4]\n"
-                        "fmla z25.h, z9.h, z2.h[4]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "fmla z22.h, z10.h, z1.h[4]\n"
-                        "fmla z26.h, z10.h, z2.h[4]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "fmla z23.h, z11.h, z1.h[4]\n"
-                        "fmla z27.h, z11.h, z2.h[4]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[5]\n"
-                        "fmla z24.h, z12.h, z2.h[5]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "fmla z21.h, z13.h, z1.h[5]\n"
-                        "fmla z25.h, z13.h, z2.h[5]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "fmla z22.h, z14.h, z1.h[5]\n"
-                        "fmla z26.h, z14.h, z2.h[5]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "fmla z23.h, z15.h, z1.h[5]\n"
-                        "fmla z27.h, z15.h, z2.h[5]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z1.h[6]\n"
-                        "fmla z24.h, z8.h, z2.h[6]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "fmla z21.h, z9.h, z1.h[6]\n"
-                        "fmla z25.h, z9.h, z2.h[6]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "fmla z22.h, z10.h, z1.h[6]\n"
-                        "fmla z26.h, z10.h, z2.h[6]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "fmla z23.h, z11.h, z1.h[6]\n"
-                        "fmla z27.h, z11.h, z2.h[6]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[7]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[7]\n"
-                        "fmla z24.h, z12.h, z2.h[7]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[7]\n"
-                        "fmla z21.h, z13.h, z1.h[7]\n"
-                        "fmla z25.h, z13.h, z2.h[7]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[7]\n"
-                        "fmla z22.h, z14.h, z1.h[7]\n"
-                        "fmla z26.h, z14.h, z2.h[7]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[7]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "fmla z23.h, z15.h, z1.h[7]\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
-                        "fmla z27.h, z15.h, z2.h[7]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[0]\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
-                        "fmla z20.h, z8.h, z5.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z24.h, z8.h, z6.h[0]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[0]\n"
-                        "fmla z21.h, z9.h, z5.h[0]\n"
-                        "fmla z25.h, z9.h, z6.h[0]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[0]\n"
-                        "fmla z22.h, z10.h, z5.h[0]\n"
-                        "fmla z26.h, z10.h, z6.h[0]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[0]\n"
-                        "fmla z23.h, z11.h, z5.h[0]\n"
-                        "fmla z27.h, z11.h, z6.h[0]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z5.h[1]\n"
-                        "fmla z24.h, z12.h, z6.h[1]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z4.h[1]\n"
-                        "fmla z21.h, z13.h, z5.h[1]\n"
-                        "fmla z25.h, z13.h, z6.h[1]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[1]\n"
-                        "fmla z22.h, z14.h, z5.h[1]\n"
-                        "fmla z26.h, z14.h, z6.h[1]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[1]\n"
-                        "fmla z23.h, z15.h, z5.h[1]\n"
-                        "fmla z27.h, z15.h, z6.h[1]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z5.h[2]\n"
-                        "fmla z24.h, z8.h, z6.h[2]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[2]\n"
-                        "fmla z21.h, z9.h, z5.h[2]\n"
-                        "fmla z25.h, z9.h, z6.h[2]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[2]\n"
-                        "fmla z22.h, z10.h, z5.h[2]\n"
-                        "fmla z26.h, z10.h, z6.h[2]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[2]\n"
-                        "fmla z23.h, z11.h, z5.h[2]\n"
-                        "fmla z27.h, z11.h, z6.h[2]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z5.h[3]\n"
-                        "fmla z24.h, z12.h, z6.h[3]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z4.h[3]\n"
-                        "fmla z21.h, z13.h, z5.h[3]\n"
-                        "fmla z25.h, z13.h, z6.h[3]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[3]\n"
-                        "fmla z22.h, z14.h, z5.h[3]\n"
-                        "fmla z26.h, z14.h, z6.h[3]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[3]\n"
-                        "fmla z23.h, z15.h, z5.h[3]\n"
-                        "fmla z27.h, z15.h, z6.h[3]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[4]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z5.h[4]\n"
-                        "fmla z24.h, z8.h, z6.h[4]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[4]\n"
-                        "fmla z21.h, z9.h, z5.h[4]\n"
-                        "fmla z25.h, z9.h, z6.h[4]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[4]\n"
-                        "fmla z22.h, z10.h, z5.h[4]\n"
-                        "fmla z26.h, z10.h, z6.h[4]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[4]\n"
-                        "fmla z23.h, z11.h, z5.h[4]\n"
-                        "fmla z27.h, z11.h, z6.h[4]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[5]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z5.h[5]\n"
-                        "fmla z24.h, z12.h, z6.h[5]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z4.h[5]\n"
-                        "fmla z21.h, z13.h, z5.h[5]\n"
-                        "fmla z25.h, z13.h, z6.h[5]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[5]\n"
-                        "fmla z22.h, z14.h, z5.h[5]\n"
-                        "fmla z26.h, z14.h, z6.h[5]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[5]\n"
-                        "fmla z23.h, z15.h, z5.h[5]\n"
-                        "fmla z27.h, z15.h, z6.h[5]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[6]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z5.h[6]\n"
-                        "fmla z24.h, z8.h, z6.h[6]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[6]\n"
-                        "fmla z21.h, z9.h, z5.h[6]\n"
-                        "fmla z25.h, z9.h, z6.h[6]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[6]\n"
-                        "fmla z22.h, z10.h, z5.h[6]\n"
-                        "fmla z26.h, z10.h, z6.h[6]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[6]\n"
-                        "fmla z23.h, z11.h, z5.h[6]\n"
-                        "fmla z27.h, z11.h, z6.h[6]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[7]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z5.h[7]\n"
-                        "fmla z24.h, z12.h, z6.h[7]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z4.h[7]\n"
-                        "fmla z21.h, z13.h, z5.h[7]\n"
-                        "fmla z25.h, z13.h, z6.h[7]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[7]\n"
-                        "fmla z22.h, z14.h, z5.h[7]\n"
-                        "fmla z26.h, z14.h, z6.h[7]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[7]\n"
-                        "fmla z23.h, z15.h, z5.h[7]\n"
-                        "fmla z27.h, z15.h, z6.h[7]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "cbz %[regs], 3f\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        "fmla z20.h, z8.h, z1.h[0]\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        "fmla z24.h, z8.h, z2.h[0]\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z21.h, z9.h, z1.h[0]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z25.h, z9.h, z2.h[0]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "fmla z22.h, z10.h, z1.h[0]\n"
-                        "fmla z26.h, z10.h, z2.h[0]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "fmla z23.h, z11.h, z1.h[0]\n"
-                        "fmla z27.h, z11.h, z2.h[0]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[1]\n"
-                        "fmla z24.h, z12.h, z2.h[1]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "fmla z21.h, z13.h, z1.h[1]\n"
-                        "fmla z25.h, z13.h, z2.h[1]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "fmla z22.h, z14.h, z1.h[1]\n"
-                        "fmla z26.h, z14.h, z2.h[1]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "fmla z23.h, z15.h, z1.h[1]\n"
-                        "fmla z27.h, z15.h, z2.h[1]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z1.h[2]\n"
-                        "fmla z24.h, z8.h, z2.h[2]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "fmla z21.h, z9.h, z1.h[2]\n"
-                        "fmla z25.h, z9.h, z2.h[2]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "fmla z22.h, z10.h, z1.h[2]\n"
-                        "fmla z26.h, z10.h, z2.h[2]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "fmla z23.h, z11.h, z1.h[2]\n"
-                        "fmla z27.h, z11.h, z2.h[2]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[3]\n"
-                        "fmla z24.h, z12.h, z2.h[3]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "fmla z21.h, z13.h, z1.h[3]\n"
-                        "fmla z25.h, z13.h, z2.h[3]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "fmla z22.h, z14.h, z1.h[3]\n"
-                        "fmla z26.h, z14.h, z2.h[3]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "fmla z23.h, z15.h, z1.h[3]\n"
-                        "fmla z27.h, z15.h, z2.h[3]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z1.h[4]\n"
-                        "fmla z24.h, z8.h, z2.h[4]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "fmla z21.h, z9.h, z1.h[4]\n"
-                        "fmla z25.h, z9.h, z2.h[4]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "fmla z22.h, z10.h, z1.h[4]\n"
-                        "fmla z26.h, z10.h, z2.h[4]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "fmla z23.h, z11.h, z1.h[4]\n"
-                        "fmla z27.h, z11.h, z2.h[4]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[5]\n"
-                        "fmla z24.h, z12.h, z2.h[5]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "fmla z21.h, z13.h, z1.h[5]\n"
-                        "fmla z25.h, z13.h, z2.h[5]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "fmla z22.h, z14.h, z1.h[5]\n"
-                        "fmla z26.h, z14.h, z2.h[5]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "fmla z23.h, z15.h, z1.h[5]\n"
-                        "fmla z27.h, z15.h, z2.h[5]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z1.h[6]\n"
-                        "fmla z24.h, z8.h, z2.h[6]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "fmla z21.h, z9.h, z1.h[6]\n"
-                        "fmla z25.h, z9.h, z2.h[6]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "fmla z22.h, z10.h, z1.h[6]\n"
-                        "fmla z26.h, z10.h, z2.h[6]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "fmla z23.h, z11.h, z1.h[6]\n"
-                        "fmla z27.h, z11.h, z2.h[6]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[7]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[7]\n"
-                        "fmla z24.h, z12.h, z2.h[7]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[7]\n"
-                        "fmla z21.h, z13.h, z1.h[7]\n"
-                        "fmla z25.h, z13.h, z2.h[7]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[7]\n"
-                        "fmla z22.h, z14.h, z1.h[7]\n"
-                        "fmla z26.h, z14.h, z2.h[7]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[7]\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        "fmla z23.h, z15.h, z1.h[7]\n"
-                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
-                        "fmla z27.h, z15.h, z2.h[7]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[0]\n"
-                        "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
-                        "fmla z20.h, z8.h, z5.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z24.h, z8.h, z6.h[0]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "fmla z21.h, z9.h, z5.h[0]\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        "fmla z25.h, z9.h, z6.h[0]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[0]\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        "fmla z22.h, z10.h, z5.h[0]\n"
-                        "fmla z26.h, z10.h, z6.h[0]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[0]\n"
-                        "fmla z23.h, z11.h, z5.h[0]\n"
-                        "fmla z27.h, z11.h, z6.h[0]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z5.h[1]\n"
-                        "fmla z24.h, z12.h, z6.h[1]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z4.h[1]\n"
-                        "fmla z21.h, z13.h, z5.h[1]\n"
-                        "fmla z25.h, z13.h, z6.h[1]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[1]\n"
-                        "fmla z22.h, z14.h, z5.h[1]\n"
-                        "fmla z26.h, z14.h, z6.h[1]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[1]\n"
-                        "fmla z23.h, z15.h, z5.h[1]\n"
-                        "fmla z27.h, z15.h, z6.h[1]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z5.h[2]\n"
-                        "fmla z24.h, z8.h, z6.h[2]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[2]\n"
-                        "fmla z21.h, z9.h, z5.h[2]\n"
-                        "fmla z25.h, z9.h, z6.h[2]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[2]\n"
-                        "fmla z22.h, z10.h, z5.h[2]\n"
-                        "fmla z26.h, z10.h, z6.h[2]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[2]\n"
-                        "fmla z23.h, z11.h, z5.h[2]\n"
-                        "fmla z27.h, z11.h, z6.h[2]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z5.h[3]\n"
-                        "fmla z24.h, z12.h, z6.h[3]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z4.h[3]\n"
-                        "fmla z21.h, z13.h, z5.h[3]\n"
-                        "fmla z25.h, z13.h, z6.h[3]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[3]\n"
-                        "fmla z22.h, z14.h, z5.h[3]\n"
-                        "fmla z26.h, z14.h, z6.h[3]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[3]\n"
-                        "fmla z23.h, z15.h, z5.h[3]\n"
-                        "fmla z27.h, z15.h, z6.h[3]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[4]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z5.h[4]\n"
-                        "fmla z24.h, z8.h, z6.h[4]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[4]\n"
-                        "fmla z21.h, z9.h, z5.h[4]\n"
-                        "fmla z25.h, z9.h, z6.h[4]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[4]\n"
-                        "fmla z22.h, z10.h, z5.h[4]\n"
-                        "fmla z26.h, z10.h, z6.h[4]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[4]\n"
-                        "fmla z23.h, z11.h, z5.h[4]\n"
-                        "fmla z27.h, z11.h, z6.h[4]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[5]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z5.h[5]\n"
-                        "fmla z24.h, z12.h, z6.h[5]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z4.h[5]\n"
-                        "fmla z21.h, z13.h, z5.h[5]\n"
-                        "fmla z25.h, z13.h, z6.h[5]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[5]\n"
-                        "fmla z22.h, z14.h, z5.h[5]\n"
-                        "fmla z26.h, z14.h, z6.h[5]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[5]\n"
-                        "fmla z23.h, z15.h, z5.h[5]\n"
-                        "fmla z27.h, z15.h, z6.h[5]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[6]\n"
-                        "fmla z20.h, z8.h, z5.h[6]\n"
-                        "fmla z24.h, z8.h, z6.h[6]\n"
-                        "fmla z17.h, z9.h, z4.h[6]\n"
-                        "fmla z21.h, z9.h, z5.h[6]\n"
-                        "fmla z25.h, z9.h, z6.h[6]\n"
-                        "fmla z18.h, z10.h, z4.h[6]\n"
-                        "fmla z22.h, z10.h, z5.h[6]\n"
-                        "fmla z26.h, z10.h, z6.h[6]\n"
-                        "fmla z19.h, z11.h, z4.h[6]\n"
-                        "fmla z23.h, z11.h, z5.h[6]\n"
-                        "fmla z27.h, z11.h, z6.h[6]\n"
-                        "fmla z16.h, z12.h, z4.h[7]\n"
-                        "fmla z20.h, z12.h, z5.h[7]\n"
-                        "fmla z24.h, z12.h, z6.h[7]\n"
-                        "fmla z17.h, z13.h, z4.h[7]\n"
-                        "fmla z21.h, z13.h, z5.h[7]\n"
-                        "fmla z25.h, z13.h, z6.h[7]\n"
-                        "fmla z18.h, z14.h, z4.h[7]\n"
-                        "fmla z22.h, z14.h, z5.h[7]\n"
-                        "fmla z26.h, z14.h, z6.h[7]\n"
-                        "fmla z19.h, z15.h, z4.h[7]\n"
-                        "fmla z23.h, z15.h, z5.h[7]\n"
-                        "fmla z27.h, z15.h, z6.h[7]\n"
-                        "cbz %[blocks], 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "fmla z20.h, z8.h, z1.h[0]\n"
-                        "fmla z24.h, z8.h, z2.h[0]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "fmla z21.h, z9.h, z1.h[0]\n"
-                        "fmla z25.h, z9.h, z2.h[0]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "fmla z22.h, z10.h, z1.h[0]\n"
-                        "fmla z26.h, z10.h, z2.h[0]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "fmla z23.h, z11.h, z1.h[0]\n"
-                        "fmla z27.h, z11.h, z2.h[0]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "fmla z20.h, z12.h, z1.h[1]\n"
-                        "fmla z24.h, z12.h, z2.h[1]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "fmla z21.h, z13.h, z1.h[1]\n"
-                        "fmla z25.h, z13.h, z2.h[1]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "fmla z22.h, z14.h, z1.h[1]\n"
-                        "fmla z26.h, z14.h, z2.h[1]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "fmla z23.h, z15.h, z1.h[1]\n"
-                        "fmla z27.h, z15.h, z2.h[1]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "fmla z20.h, z8.h, z1.h[2]\n"
-                        "fmla z24.h, z8.h, z2.h[2]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "fmla z21.h, z9.h, z1.h[2]\n"
-                        "fmla z25.h, z9.h, z2.h[2]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "fmla z22.h, z10.h, z1.h[2]\n"
-                        "fmla z26.h, z10.h, z2.h[2]\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "fmla z23.h, z11.h, z1.h[2]\n"
-                        "fmla z27.h, z11.h, z2.h[2]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "fmla z20.h, z12.h, z1.h[3]\n"
-                        "fmla z24.h, z12.h, z2.h[3]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "fmla z21.h, z13.h, z1.h[3]\n"
-                        "fmla z25.h, z13.h, z2.h[3]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "fmla z22.h, z14.h, z1.h[3]\n"
-                        "fmla z26.h, z14.h, z2.h[3]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "fmla z23.h, z15.h, z1.h[3]\n"
-                        "fmla z27.h, z15.h, z2.h[3]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "fmla z20.h, z8.h, z1.h[4]\n"
-                        "fmla z24.h, z8.h, z2.h[4]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "fmla z21.h, z9.h, z1.h[4]\n"
-                        "fmla z25.h, z9.h, z2.h[4]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "fmla z22.h, z10.h, z1.h[4]\n"
-                        "fmla z26.h, z10.h, z2.h[4]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "fmla z23.h, z11.h, z1.h[4]\n"
-                        "fmla z27.h, z11.h, z2.h[4]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "fmla z20.h, z12.h, z1.h[5]\n"
-                        "fmla z24.h, z12.h, z2.h[5]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "fmla z21.h, z13.h, z1.h[5]\n"
-                        "fmla z25.h, z13.h, z2.h[5]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "fmla z22.h, z14.h, z1.h[5]\n"
-                        "fmla z26.h, z14.h, z2.h[5]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "fmla z23.h, z15.h, z1.h[5]\n"
-                        "fmla z27.h, z15.h, z2.h[5]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "fmla z20.h, z8.h, z1.h[6]\n"
-                        "fmla z24.h, z8.h, z2.h[6]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "fmla z21.h, z9.h, z1.h[6]\n"
-                        "fmla z25.h, z9.h, z2.h[6]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "fmla z22.h, z10.h, z1.h[6]\n"
-                        "fmla z26.h, z10.h, z2.h[6]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "fmla z23.h, z11.h, z1.h[6]\n"
-                        "fmla z27.h, z11.h, z2.h[6]\n"
-                        "b 4f\n"
-                        "3:\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
-                        "fmla z20.h, z8.h, z1.h[0]\n"
-                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
-                        "fmla z24.h, z8.h, z2.h[0]\n"
-                        "ld1rqh z6.h, p6/z, [a_ptr2]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z21.h, z9.h, z1.h[0]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z25.h, z9.h, z2.h[0]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "fmla z22.h, z10.h, z1.h[0]\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        "fmla z26.h, z10.h, z2.h[0]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        "fmla z23.h, z11.h, z1.h[0]\n"
-                        "fmla z27.h, z11.h, z2.h[0]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[1]\n"
-                        "fmla z24.h, z12.h, z2.h[1]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "fmla z21.h, z13.h, z1.h[1]\n"
-                        "fmla z25.h, z13.h, z2.h[1]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "fmla z22.h, z14.h, z1.h[1]\n"
-                        "fmla z26.h, z14.h, z2.h[1]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "fmla z23.h, z15.h, z1.h[1]\n"
-                        "fmla z27.h, z15.h, z2.h[1]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z1.h[2]\n"
-                        "fmla z24.h, z8.h, z2.h[2]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "fmla z21.h, z9.h, z1.h[2]\n"
-                        "fmla z25.h, z9.h, z2.h[2]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "fmla z22.h, z10.h, z1.h[2]\n"
-                        "fmla z26.h, z10.h, z2.h[2]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "fmla z23.h, z11.h, z1.h[2]\n"
-                        "fmla z27.h, z11.h, z2.h[2]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[3]\n"
-                        "fmla z24.h, z12.h, z2.h[3]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "fmla z21.h, z13.h, z1.h[3]\n"
-                        "fmla z25.h, z13.h, z2.h[3]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "fmla z22.h, z14.h, z1.h[3]\n"
-                        "fmla z26.h, z14.h, z2.h[3]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "fmla z23.h, z15.h, z1.h[3]\n"
-                        "fmla z27.h, z15.h, z2.h[3]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z1.h[4]\n"
-                        "fmla z24.h, z8.h, z2.h[4]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "fmla z21.h, z9.h, z1.h[4]\n"
-                        "fmla z25.h, z9.h, z2.h[4]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "fmla z22.h, z10.h, z1.h[4]\n"
-                        "fmla z26.h, z10.h, z2.h[4]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "fmla z23.h, z11.h, z1.h[4]\n"
-                        "fmla z27.h, z11.h, z2.h[4]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[5]\n"
-                        "fmla z24.h, z12.h, z2.h[5]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "fmla z21.h, z13.h, z1.h[5]\n"
-                        "fmla z25.h, z13.h, z2.h[5]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "fmla z22.h, z14.h, z1.h[5]\n"
-                        "fmla z26.h, z14.h, z2.h[5]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "fmla z23.h, z15.h, z1.h[5]\n"
-                        "fmla z27.h, z15.h, z2.h[5]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "fmla z20.h, z8.h, z1.h[6]\n"
-                        "fmla z24.h, z8.h, z2.h[6]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "fmla z21.h, z9.h, z1.h[6]\n"
-                        "fmla z25.h, z9.h, z2.h[6]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "fmla z22.h, z10.h, z1.h[6]\n"
-                        "fmla z26.h, z10.h, z2.h[6]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "fmla z23.h, z11.h, z1.h[6]\n"
-                        "fmla z27.h, z11.h, z2.h[6]\n"
-                        "fmla z16.h, z12.h, z0.h[7]\n"
-                        "fmla z20.h, z12.h, z1.h[7]\n"
-                        "fmla z24.h, z12.h, z2.h[7]\n"
-                        "fmla z17.h, z13.h, z0.h[7]\n"
-                        "fmla z21.h, z13.h, z1.h[7]\n"
-                        "fmla z25.h, z13.h, z2.h[7]\n"
-                        "fmla z18.h, z14.h, z0.h[7]\n"
-                        "fmla z22.h, z14.h, z1.h[7]\n"
-                        "fmla z26.h, z14.h, z2.h[7]\n"
-                        "fmla z19.h, z15.h, z0.h[7]\n"
-                        "fmla z23.h, z15.h, z1.h[7]\n"
-                        "fmla z27.h, z15.h, z2.h[7]\n"
-                        "cbz %[blocks], 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[0]\n"
-                        "fmla z20.h, z8.h, z5.h[0]\n"
-                        "fmla z24.h, z8.h, z6.h[0]\n"
-                        "fmla z17.h, z9.h, z4.h[0]\n"
-                        "fmla z21.h, z9.h, z5.h[0]\n"
-                        "fmla z25.h, z9.h, z6.h[0]\n"
-                        "fmla z18.h, z10.h, z4.h[0]\n"
-                        "fmla z22.h, z10.h, z5.h[0]\n"
-                        "fmla z26.h, z10.h, z6.h[0]\n"
-                        "fmla z19.h, z11.h, z4.h[0]\n"
-                        "fmla z23.h, z11.h, z5.h[0]\n"
-                        "fmla z27.h, z11.h, z6.h[0]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[1]\n"
-                        "fmla z20.h, z12.h, z5.h[1]\n"
-                        "fmla z24.h, z12.h, z6.h[1]\n"
-                        "fmla z17.h, z13.h, z4.h[1]\n"
-                        "fmla z21.h, z13.h, z5.h[1]\n"
-                        "fmla z25.h, z13.h, z6.h[1]\n"
-                        "fmla z18.h, z14.h, z4.h[1]\n"
-                        "fmla z22.h, z14.h, z5.h[1]\n"
-                        "fmla z26.h, z14.h, z6.h[1]\n"
-                        "fmla z19.h, z15.h, z4.h[1]\n"
-                        "fmla z23.h, z15.h, z5.h[1]\n"
-                        "fmla z27.h, z15.h, z6.h[1]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[2]\n"
-                        "fmla z20.h, z8.h, z5.h[2]\n"
-                        "fmla z24.h, z8.h, z6.h[2]\n"
-                        "fmla z17.h, z9.h, z4.h[2]\n"
-                        "fmla z21.h, z9.h, z5.h[2]\n"
-                        "fmla z25.h, z9.h, z6.h[2]\n"
-                        "fmla z18.h, z10.h, z4.h[2]\n"
-                        "fmla z22.h, z10.h, z5.h[2]\n"
-                        "fmla z26.h, z10.h, z6.h[2]\n"
-                        "fmla z19.h, z11.h, z4.h[2]\n"
-                        "fmla z23.h, z11.h, z5.h[2]\n"
-                        "fmla z27.h, z11.h, z6.h[2]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[3]\n"
-                        "fmla z20.h, z12.h, z5.h[3]\n"
-                        "fmla z24.h, z12.h, z6.h[3]\n"
-                        "fmla z17.h, z13.h, z4.h[3]\n"
-                        "fmla z21.h, z13.h, z5.h[3]\n"
-                        "fmla z25.h, z13.h, z6.h[3]\n"
-                        "fmla z18.h, z14.h, z4.h[3]\n"
-                        "fmla z22.h, z14.h, z5.h[3]\n"
-                        "fmla z26.h, z14.h, z6.h[3]\n"
-                        "fmla z19.h, z15.h, z4.h[3]\n"
-                        "fmla z23.h, z15.h, z5.h[3]\n"
-                        "fmla z27.h, z15.h, z6.h[3]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[4]\n"
-                        "fmla z20.h, z8.h, z5.h[4]\n"
-                        "fmla z24.h, z8.h, z6.h[4]\n"
-                        "fmla z17.h, z9.h, z4.h[4]\n"
-                        "fmla z21.h, z9.h, z5.h[4]\n"
-                        "fmla z25.h, z9.h, z6.h[4]\n"
-                        "fmla z18.h, z10.h, z4.h[4]\n"
-                        "fmla z22.h, z10.h, z5.h[4]\n"
-                        "fmla z26.h, z10.h, z6.h[4]\n"
-                        "fmla z19.h, z11.h, z4.h[4]\n"
-                        "fmla z23.h, z11.h, z5.h[4]\n"
-                        "fmla z27.h, z11.h, z6.h[4]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[5]\n"
-                        "fmla z20.h, z12.h, z5.h[5]\n"
-                        "fmla z24.h, z12.h, z6.h[5]\n"
-                        "fmla z17.h, z13.h, z4.h[5]\n"
-                        "fmla z21.h, z13.h, z5.h[5]\n"
-                        "fmla z25.h, z13.h, z6.h[5]\n"
-                        "fmla z18.h, z14.h, z4.h[5]\n"
-                        "fmla z22.h, z14.h, z5.h[5]\n"
-                        "fmla z26.h, z14.h, z6.h[5]\n"
-                        "fmla z19.h, z15.h, z4.h[5]\n"
-                        "fmla z23.h, z15.h, z5.h[5]\n"
-                        "fmla z27.h, z15.h, z6.h[5]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[6]\n"
-                        "fmla z20.h, z8.h, z5.h[6]\n"
-                        "fmla z24.h, z8.h, z6.h[6]\n"
-                        "fmla z17.h, z9.h, z4.h[6]\n"
-                        "fmla z21.h, z9.h, z5.h[6]\n"
-                        "fmla z25.h, z9.h, z6.h[6]\n"
-                        "fmla z18.h, z10.h, z4.h[6]\n"
-                        "fmla z22.h, z10.h, z5.h[6]\n"
-                        "fmla z26.h, z10.h, z6.h[6]\n"
-                        "fmla z19.h, z11.h, z4.h[6]\n"
-                        "fmla z23.h, z11.h, z5.h[6]\n"
-                        "fmla z27.h, z11.h, z6.h[6]\n"
-                        "4:\n"
-                        "ld1rh z14.h, p7/z, [%[minptr]]\n"
-                        "ld1rh z15.h, p7/z, [%[maxptr]]\n"
-                        "fmax z16.h, p7/m, z16.h, z14.h\n"
-                        "fmax z17.h, p7/m, z17.h, z14.h\n"
-                        "fmax z18.h, p7/m, z18.h, z14.h\n"
-                        "fmax z19.h, p7/m, z19.h, z14.h\n"
-                        "fmin z16.h, p7/m, z16.h, z15.h\n"
-                        "fmin z17.h, p7/m, z17.h, z15.h\n"
-                        "fmin z18.h, p7/m, z18.h, z15.h\n"
-                        "fmin z19.h, p7/m, z19.h, z15.h\n"
-                        "st1h z16.h, p0, [%[c_ptr0]]\n"
-                        "fmax z20.h, p7/m, z20.h, z14.h\n"
-                        "fmax z21.h, p7/m, z21.h, z14.h\n"
-                        "fmax z22.h, p7/m, z22.h, z14.h\n"
-                        "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmax z23.h, p7/m, z23.h, z14.h\n"
-                        "fmin z20.h, p7/m, z20.h, z15.h\n"
-                        "fmin z21.h, p7/m, z21.h, z15.h\n"
-                        "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "fmin z22.h, p7/m, z22.h, z15.h\n"
-                        "fmin z23.h, p7/m, z23.h, z15.h\n"
-                        "fmax z24.h, p7/m, z24.h, z14.h\n"
-                        "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "fmax z25.h, p7/m, z25.h, z14.h\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "fmax z26.h, p7/m, z26.h, z14.h\n"
-                        "st1h z20.h, p0, [c_ptr1]\n"
-                        "fmin z24.h, p7/m, z24.h, z15.h\n"
-                        "fmin z25.h, p7/m, z25.h, z15.h\n"
-                        "fmax z27.h, p7/m, z27.h, z14.h\n"
-                        "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
-                        "fmin z26.h, p7/m, z26.h, z15.h\n"
-                        "fmin z27.h, p7/m, z27.h, z15.h\n"
-                        "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
-                        "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
-                        "st1h z24.h, p0, [c_ptr2]\n"
-                        "st1h z25.h, p1, [c_ptr2, #1, MUL VL]\n"
-                        "st1h z26.h, p2, [c_ptr2, #2, MUL VL]\n"
-                        "st1h z27.h, p3, [c_ptr2, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
-                    );
-                    break;
-                default:
-                case 4:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "whilelt p6.h, %[temp], %[leftovers]\n"
-                        "whilelt p0.h, %[temp], %[width]\n"
-                        "inch %[temp], all, mul #1\n"
-                        "ptrue p7.h\n"
-                        "ld1h z16.h, p0/z, [%[biasptr]]\n"
-                        "whilelt p1.h, %[temp], %[width]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
-                        "inch %[temp], all, mul #1\n"
-                        "mov z20.d, z16.d\n"
-                        "ld1h z17.h, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "mov z24.d, z16.d\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1]\n"
-                        "mov z28.d, z16.d\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2]\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3]\n"
-                        "whilelt p2.h, %[temp], %[width]\n"
-                        "mov z21.d, z17.d\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "mov z25.d, z17.d\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "mov z29.d, z17.d\n"
-                        "ld1h z18.h, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "inch %[temp], all, mul #1\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "mov z22.d, z18.d\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "mov z26.d, z18.d\n"
-                        "whilelt p3.h, %[temp], %[width]\n"
-                        "mov z30.d, z18.d\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "ld1h z19.h, p3/z, [%[biasptr], #3, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z23.d, z19.d\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "mov z27.d, z19.d\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "mov z31.d, z19.d\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z20.h, z8.h, z1.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        "fmla z24.h, z8.h, z2.h[0]\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        "fmla z28.h, z8.h, z3.h[0]\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "ld1rqh z7.h, p7/z, [a_ptr3]\n"
-                        "fmla z21.h, z9.h, z1.h[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla z25.h, z9.h, z2.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z29.h, z9.h, z3.h[0]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z22.h, z10.h, z1.h[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla z26.h, z10.h, z2.h[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla z30.h, z10.h, z3.h[0]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "fmla z23.h, z11.h, z1.h[0]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        "fmla z27.h, z11.h, z2.h[0]\n"
-                        "fmla z31.h, z11.h, z3.h[0]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[1]\n"
-                        "fmla z24.h, z12.h, z2.h[1]\n"
-                        "fmla z28.h, z12.h, z3.h[1]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "fmla z21.h, z13.h, z1.h[1]\n"
-                        "fmla z25.h, z13.h, z2.h[1]\n"
-                        "fmla z29.h, z13.h, z3.h[1]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "fmla z22.h, z14.h, z1.h[1]\n"
-                        "fmla z26.h, z14.h, z2.h[1]\n"
-                        "fmla z30.h, z14.h, z3.h[1]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "fmla z23.h, z15.h, z1.h[1]\n"
-                        "fmla z27.h, z15.h, z2.h[1]\n"
-                        "fmla z31.h, z15.h, z3.h[1]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z1.h[2]\n"
-                        "fmla z24.h, z8.h, z2.h[2]\n"
-                        "fmla z28.h, z8.h, z3.h[2]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "fmla z21.h, z9.h, z1.h[2]\n"
-                        "fmla z25.h, z9.h, z2.h[2]\n"
-                        "fmla z29.h, z9.h, z3.h[2]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "fmla z22.h, z10.h, z1.h[2]\n"
-                        "fmla z26.h, z10.h, z2.h[2]\n"
-                        "fmla z30.h, z10.h, z3.h[2]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "fmla z23.h, z11.h, z1.h[2]\n"
-                        "fmla z27.h, z11.h, z2.h[2]\n"
-                        "fmla z31.h, z11.h, z3.h[2]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[3]\n"
-                        "fmla z24.h, z12.h, z2.h[3]\n"
-                        "fmla z28.h, z12.h, z3.h[3]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "fmla z21.h, z13.h, z1.h[3]\n"
-                        "fmla z25.h, z13.h, z2.h[3]\n"
-                        "fmla z29.h, z13.h, z3.h[3]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "fmla z22.h, z14.h, z1.h[3]\n"
-                        "fmla z26.h, z14.h, z2.h[3]\n"
-                        "fmla z30.h, z14.h, z3.h[3]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "fmla z23.h, z15.h, z1.h[3]\n"
-                        "fmla z27.h, z15.h, z2.h[3]\n"
-                        "fmla z31.h, z15.h, z3.h[3]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z1.h[4]\n"
-                        "fmla z24.h, z8.h, z2.h[4]\n"
-                        "fmla z28.h, z8.h, z3.h[4]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "fmla z21.h, z9.h, z1.h[4]\n"
-                        "fmla z25.h, z9.h, z2.h[4]\n"
-                        "fmla z29.h, z9.h, z3.h[4]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "fmla z22.h, z10.h, z1.h[4]\n"
-                        "fmla z26.h, z10.h, z2.h[4]\n"
-                        "fmla z30.h, z10.h, z3.h[4]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "fmla z23.h, z11.h, z1.h[4]\n"
-                        "fmla z27.h, z11.h, z2.h[4]\n"
-                        "fmla z31.h, z11.h, z3.h[4]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[5]\n"
-                        "fmla z24.h, z12.h, z2.h[5]\n"
-                        "fmla z28.h, z12.h, z3.h[5]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "fmla z21.h, z13.h, z1.h[5]\n"
-                        "fmla z25.h, z13.h, z2.h[5]\n"
-                        "fmla z29.h, z13.h, z3.h[5]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "fmla z22.h, z14.h, z1.h[5]\n"
-                        "fmla z26.h, z14.h, z2.h[5]\n"
-                        "fmla z30.h, z14.h, z3.h[5]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "fmla z23.h, z15.h, z1.h[5]\n"
-                        "fmla z27.h, z15.h, z2.h[5]\n"
-                        "fmla z31.h, z15.h, z3.h[5]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z1.h[6]\n"
-                        "fmla z24.h, z8.h, z2.h[6]\n"
-                        "fmla z28.h, z8.h, z3.h[6]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "fmla z21.h, z9.h, z1.h[6]\n"
-                        "fmla z25.h, z9.h, z2.h[6]\n"
-                        "fmla z29.h, z9.h, z3.h[6]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "fmla z22.h, z10.h, z1.h[6]\n"
-                        "fmla z26.h, z10.h, z2.h[6]\n"
-                        "fmla z30.h, z10.h, z3.h[6]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "fmla z23.h, z11.h, z1.h[6]\n"
-                        "fmla z27.h, z11.h, z2.h[6]\n"
-                        "fmla z31.h, z11.h, z3.h[6]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[7]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[7]\n"
-                        "fmla z24.h, z12.h, z2.h[7]\n"
-                        "fmla z28.h, z12.h, z3.h[7]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[7]\n"
-                        "fmla z21.h, z13.h, z1.h[7]\n"
-                        "fmla z25.h, z13.h, z2.h[7]\n"
-                        "fmla z29.h, z13.h, z3.h[7]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[7]\n"
-                        "fmla z22.h, z14.h, z1.h[7]\n"
-                        "fmla z26.h, z14.h, z2.h[7]\n"
-                        "fmla z30.h, z14.h, z3.h[7]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[7]\n"
-                        "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "fmla z23.h, z15.h, z1.h[7]\n"
-                        "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
-                        "fmla z27.h, z15.h, z2.h[7]\n"
-                        "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
-                        "fmla z31.h, z15.h, z3.h[7]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[0]\n"
-                        "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
-                        "fmla z20.h, z8.h, z5.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z24.h, z8.h, z6.h[0]\n"
-                        "fmla z28.h, z8.h, z7.h[0]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[0]\n"
-                        "fmla z21.h, z9.h, z5.h[0]\n"
-                        "fmla z25.h, z9.h, z6.h[0]\n"
-                        "fmla z29.h, z9.h, z7.h[0]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[0]\n"
-                        "fmla z22.h, z10.h, z5.h[0]\n"
-                        "fmla z26.h, z10.h, z6.h[0]\n"
-                        "fmla z30.h, z10.h, z7.h[0]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[0]\n"
-                        "fmla z23.h, z11.h, z5.h[0]\n"
-                        "fmla z27.h, z11.h, z6.h[0]\n"
-                        "fmla z31.h, z11.h, z7.h[0]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z5.h[1]\n"
-                        "fmla z24.h, z12.h, z6.h[1]\n"
-                        "fmla z28.h, z12.h, z7.h[1]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z4.h[1]\n"
-                        "fmla z21.h, z13.h, z5.h[1]\n"
-                        "fmla z25.h, z13.h, z6.h[1]\n"
-                        "fmla z29.h, z13.h, z7.h[1]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[1]\n"
-                        "fmla z22.h, z14.h, z5.h[1]\n"
-                        "fmla z26.h, z14.h, z6.h[1]\n"
-                        "fmla z30.h, z14.h, z7.h[1]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[1]\n"
-                        "fmla z23.h, z15.h, z5.h[1]\n"
-                        "fmla z27.h, z15.h, z6.h[1]\n"
-                        "fmla z31.h, z15.h, z7.h[1]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z5.h[2]\n"
-                        "fmla z24.h, z8.h, z6.h[2]\n"
-                        "fmla z28.h, z8.h, z7.h[2]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[2]\n"
-                        "fmla z21.h, z9.h, z5.h[2]\n"
-                        "fmla z25.h, z9.h, z6.h[2]\n"
-                        "fmla z29.h, z9.h, z7.h[2]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[2]\n"
-                        "fmla z22.h, z10.h, z5.h[2]\n"
-                        "fmla z26.h, z10.h, z6.h[2]\n"
-                        "fmla z30.h, z10.h, z7.h[2]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[2]\n"
-                        "fmla z23.h, z11.h, z5.h[2]\n"
-                        "fmla z27.h, z11.h, z6.h[2]\n"
-                        "fmla z31.h, z11.h, z7.h[2]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z5.h[3]\n"
-                        "fmla z24.h, z12.h, z6.h[3]\n"
-                        "fmla z28.h, z12.h, z7.h[3]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z4.h[3]\n"
-                        "fmla z21.h, z13.h, z5.h[3]\n"
-                        "fmla z25.h, z13.h, z6.h[3]\n"
-                        "fmla z29.h, z13.h, z7.h[3]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[3]\n"
-                        "fmla z22.h, z14.h, z5.h[3]\n"
-                        "fmla z26.h, z14.h, z6.h[3]\n"
-                        "fmla z30.h, z14.h, z7.h[3]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[3]\n"
-                        "fmla z23.h, z15.h, z5.h[3]\n"
-                        "fmla z27.h, z15.h, z6.h[3]\n"
-                        "fmla z31.h, z15.h, z7.h[3]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[4]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z5.h[4]\n"
-                        "fmla z24.h, z8.h, z6.h[4]\n"
-                        "fmla z28.h, z8.h, z7.h[4]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[4]\n"
-                        "fmla z21.h, z9.h, z5.h[4]\n"
-                        "fmla z25.h, z9.h, z6.h[4]\n"
-                        "fmla z29.h, z9.h, z7.h[4]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[4]\n"
-                        "fmla z22.h, z10.h, z5.h[4]\n"
-                        "fmla z26.h, z10.h, z6.h[4]\n"
-                        "fmla z30.h, z10.h, z7.h[4]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[4]\n"
-                        "fmla z23.h, z11.h, z5.h[4]\n"
-                        "fmla z27.h, z11.h, z6.h[4]\n"
-                        "fmla z31.h, z11.h, z7.h[4]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[5]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z5.h[5]\n"
-                        "fmla z24.h, z12.h, z6.h[5]\n"
-                        "fmla z28.h, z12.h, z7.h[5]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z4.h[5]\n"
-                        "fmla z21.h, z13.h, z5.h[5]\n"
-                        "fmla z25.h, z13.h, z6.h[5]\n"
-                        "fmla z29.h, z13.h, z7.h[5]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[5]\n"
-                        "fmla z22.h, z14.h, z5.h[5]\n"
-                        "fmla z26.h, z14.h, z6.h[5]\n"
-                        "fmla z30.h, z14.h, z7.h[5]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[5]\n"
-                        "fmla z23.h, z15.h, z5.h[5]\n"
-                        "fmla z27.h, z15.h, z6.h[5]\n"
-                        "fmla z31.h, z15.h, z7.h[5]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[6]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z5.h[6]\n"
-                        "fmla z24.h, z8.h, z6.h[6]\n"
-                        "fmla z28.h, z8.h, z7.h[6]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[6]\n"
-                        "fmla z21.h, z9.h, z5.h[6]\n"
-                        "fmla z25.h, z9.h, z6.h[6]\n"
-                        "fmla z29.h, z9.h, z7.h[6]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[6]\n"
-                        "fmla z22.h, z10.h, z5.h[6]\n"
-                        "fmla z26.h, z10.h, z6.h[6]\n"
-                        "fmla z30.h, z10.h, z7.h[6]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[6]\n"
-                        "fmla z23.h, z11.h, z5.h[6]\n"
-                        "fmla z27.h, z11.h, z6.h[6]\n"
-                        "fmla z31.h, z11.h, z7.h[6]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[7]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z5.h[7]\n"
-                        "fmla z24.h, z12.h, z6.h[7]\n"
-                        "fmla z28.h, z12.h, z7.h[7]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z4.h[7]\n"
-                        "fmla z21.h, z13.h, z5.h[7]\n"
-                        "fmla z25.h, z13.h, z6.h[7]\n"
-                        "fmla z29.h, z13.h, z7.h[7]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[7]\n"
-                        "fmla z22.h, z14.h, z5.h[7]\n"
-                        "fmla z26.h, z14.h, z6.h[7]\n"
-                        "fmla z30.h, z14.h, z7.h[7]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[7]\n"
-                        "fmla z23.h, z15.h, z5.h[7]\n"
-                        "fmla z27.h, z15.h, z6.h[7]\n"
-                        "fmla z31.h, z15.h, z7.h[7]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "cbz %[regs], 3f\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
-                        "fmla z20.h, z8.h, z1.h[0]\n"
-                        "ld1rqh z5.h, p7/z, [a_ptr1]\n"
-                        "fmla z24.h, z8.h, z2.h[0]\n"
-                        "ld1rqh z6.h, p7/z, [a_ptr2]\n"
-                        "fmla z28.h, z8.h, z3.h[0]\n"
-                        "ld1rqh z7.h, p7/z, [a_ptr3]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z21.h, z9.h, z1.h[0]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z25.h, z9.h, z2.h[0]\n"
-                        "fmla z29.h, z9.h, z3.h[0]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "fmla z22.h, z10.h, z1.h[0]\n"
-                        "fmla z26.h, z10.h, z2.h[0]\n"
-                        "fmla z30.h, z10.h, z3.h[0]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "fmla z23.h, z11.h, z1.h[0]\n"
-                        "fmla z27.h, z11.h, z2.h[0]\n"
-                        "fmla z31.h, z11.h, z3.h[0]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[1]\n"
-                        "fmla z24.h, z12.h, z2.h[1]\n"
-                        "fmla z28.h, z12.h, z3.h[1]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "fmla z21.h, z13.h, z1.h[1]\n"
-                        "fmla z25.h, z13.h, z2.h[1]\n"
-                        "fmla z29.h, z13.h, z3.h[1]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "fmla z22.h, z14.h, z1.h[1]\n"
-                        "fmla z26.h, z14.h, z2.h[1]\n"
-                        "fmla z30.h, z14.h, z3.h[1]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "fmla z23.h, z15.h, z1.h[1]\n"
-                        "fmla z27.h, z15.h, z2.h[1]\n"
-                        "fmla z31.h, z15.h, z3.h[1]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z1.h[2]\n"
-                        "fmla z24.h, z8.h, z2.h[2]\n"
-                        "fmla z28.h, z8.h, z3.h[2]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "fmla z21.h, z9.h, z1.h[2]\n"
-                        "fmla z25.h, z9.h, z2.h[2]\n"
-                        "fmla z29.h, z9.h, z3.h[2]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "fmla z22.h, z10.h, z1.h[2]\n"
-                        "fmla z26.h, z10.h, z2.h[2]\n"
-                        "fmla z30.h, z10.h, z3.h[2]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "fmla z23.h, z11.h, z1.h[2]\n"
-                        "fmla z27.h, z11.h, z2.h[2]\n"
-                        "fmla z31.h, z11.h, z3.h[2]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[3]\n"
-                        "fmla z24.h, z12.h, z2.h[3]\n"
-                        "fmla z28.h, z12.h, z3.h[3]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "fmla z21.h, z13.h, z1.h[3]\n"
-                        "fmla z25.h, z13.h, z2.h[3]\n"
-                        "fmla z29.h, z13.h, z3.h[3]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "fmla z22.h, z14.h, z1.h[3]\n"
-                        "fmla z26.h, z14.h, z2.h[3]\n"
-                        "fmla z30.h, z14.h, z3.h[3]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "fmla z23.h, z15.h, z1.h[3]\n"
-                        "fmla z27.h, z15.h, z2.h[3]\n"
-                        "fmla z31.h, z15.h, z3.h[3]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z1.h[4]\n"
-                        "fmla z24.h, z8.h, z2.h[4]\n"
-                        "fmla z28.h, z8.h, z3.h[4]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "fmla z21.h, z9.h, z1.h[4]\n"
-                        "fmla z25.h, z9.h, z2.h[4]\n"
-                        "fmla z29.h, z9.h, z3.h[4]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "fmla z22.h, z10.h, z1.h[4]\n"
-                        "fmla z26.h, z10.h, z2.h[4]\n"
-                        "fmla z30.h, z10.h, z3.h[4]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "fmla z23.h, z11.h, z1.h[4]\n"
-                        "fmla z27.h, z11.h, z2.h[4]\n"
-                        "fmla z31.h, z11.h, z3.h[4]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[5]\n"
-                        "fmla z24.h, z12.h, z2.h[5]\n"
-                        "fmla z28.h, z12.h, z3.h[5]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "fmla z21.h, z13.h, z1.h[5]\n"
-                        "fmla z25.h, z13.h, z2.h[5]\n"
-                        "fmla z29.h, z13.h, z3.h[5]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "fmla z22.h, z14.h, z1.h[5]\n"
-                        "fmla z26.h, z14.h, z2.h[5]\n"
-                        "fmla z30.h, z14.h, z3.h[5]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "fmla z23.h, z15.h, z1.h[5]\n"
-                        "fmla z27.h, z15.h, z2.h[5]\n"
-                        "fmla z31.h, z15.h, z3.h[5]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z1.h[6]\n"
-                        "fmla z24.h, z8.h, z2.h[6]\n"
-                        "fmla z28.h, z8.h, z3.h[6]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "fmla z21.h, z9.h, z1.h[6]\n"
-                        "fmla z25.h, z9.h, z2.h[6]\n"
-                        "fmla z29.h, z9.h, z3.h[6]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "fmla z22.h, z10.h, z1.h[6]\n"
-                        "fmla z26.h, z10.h, z2.h[6]\n"
-                        "fmla z30.h, z10.h, z3.h[6]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "fmla z23.h, z11.h, z1.h[6]\n"
-                        "fmla z27.h, z11.h, z2.h[6]\n"
-                        "fmla z31.h, z11.h, z3.h[6]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[7]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[7]\n"
-                        "fmla z24.h, z12.h, z2.h[7]\n"
-                        "fmla z28.h, z12.h, z3.h[7]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[7]\n"
-                        "fmla z21.h, z13.h, z1.h[7]\n"
-                        "fmla z25.h, z13.h, z2.h[7]\n"
-                        "fmla z29.h, z13.h, z3.h[7]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[7]\n"
-                        "fmla z22.h, z14.h, z1.h[7]\n"
-                        "fmla z26.h, z14.h, z2.h[7]\n"
-                        "fmla z30.h, z14.h, z3.h[7]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[7]\n"
-                        "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
-                        "fmla z23.h, z15.h, z1.h[7]\n"
-                        "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
-                        "fmla z27.h, z15.h, z2.h[7]\n"
-                        "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
-                        "fmla z31.h, z15.h, z3.h[7]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[0]\n"
-                        "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
-                        "fmla z20.h, z8.h, z5.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z24.h, z8.h, z6.h[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "fmla z28.h, z8.h, z7.h[0]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[0]\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        "fmla z21.h, z9.h, z5.h[0]\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        "fmla z25.h, z9.h, z6.h[0]\n"
-                        "addvl a_ptr3, a_ptr3, #2\n"
-                        "fmla z29.h, z9.h, z7.h[0]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[0]\n"
-                        "fmla z22.h, z10.h, z5.h[0]\n"
-                        "fmla z26.h, z10.h, z6.h[0]\n"
-                        "fmla z30.h, z10.h, z7.h[0]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[0]\n"
-                        "fmla z23.h, z11.h, z5.h[0]\n"
-                        "fmla z27.h, z11.h, z6.h[0]\n"
-                        "fmla z31.h, z11.h, z7.h[0]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z5.h[1]\n"
-                        "fmla z24.h, z12.h, z6.h[1]\n"
-                        "fmla z28.h, z12.h, z7.h[1]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z4.h[1]\n"
-                        "fmla z21.h, z13.h, z5.h[1]\n"
-                        "fmla z25.h, z13.h, z6.h[1]\n"
-                        "fmla z29.h, z13.h, z7.h[1]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[1]\n"
-                        "fmla z22.h, z14.h, z5.h[1]\n"
-                        "fmla z26.h, z14.h, z6.h[1]\n"
-                        "fmla z30.h, z14.h, z7.h[1]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[1]\n"
-                        "fmla z23.h, z15.h, z5.h[1]\n"
-                        "fmla z27.h, z15.h, z6.h[1]\n"
-                        "fmla z31.h, z15.h, z7.h[1]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z5.h[2]\n"
-                        "fmla z24.h, z8.h, z6.h[2]\n"
-                        "fmla z28.h, z8.h, z7.h[2]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[2]\n"
-                        "fmla z21.h, z9.h, z5.h[2]\n"
-                        "fmla z25.h, z9.h, z6.h[2]\n"
-                        "fmla z29.h, z9.h, z7.h[2]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[2]\n"
-                        "fmla z22.h, z10.h, z5.h[2]\n"
-                        "fmla z26.h, z10.h, z6.h[2]\n"
-                        "fmla z30.h, z10.h, z7.h[2]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[2]\n"
-                        "fmla z23.h, z11.h, z5.h[2]\n"
-                        "fmla z27.h, z11.h, z6.h[2]\n"
-                        "fmla z31.h, z11.h, z7.h[2]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z5.h[3]\n"
-                        "fmla z24.h, z12.h, z6.h[3]\n"
-                        "fmla z28.h, z12.h, z7.h[3]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z4.h[3]\n"
-                        "fmla z21.h, z13.h, z5.h[3]\n"
-                        "fmla z25.h, z13.h, z6.h[3]\n"
-                        "fmla z29.h, z13.h, z7.h[3]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[3]\n"
-                        "fmla z22.h, z14.h, z5.h[3]\n"
-                        "fmla z26.h, z14.h, z6.h[3]\n"
-                        "fmla z30.h, z14.h, z7.h[3]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[3]\n"
-                        "fmla z23.h, z15.h, z5.h[3]\n"
-                        "fmla z27.h, z15.h, z6.h[3]\n"
-                        "fmla z31.h, z15.h, z7.h[3]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[4]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z5.h[4]\n"
-                        "fmla z24.h, z8.h, z6.h[4]\n"
-                        "fmla z28.h, z8.h, z7.h[4]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z4.h[4]\n"
-                        "fmla z21.h, z9.h, z5.h[4]\n"
-                        "fmla z25.h, z9.h, z6.h[4]\n"
-                        "fmla z29.h, z9.h, z7.h[4]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z4.h[4]\n"
-                        "fmla z22.h, z10.h, z5.h[4]\n"
-                        "fmla z26.h, z10.h, z6.h[4]\n"
-                        "fmla z30.h, z10.h, z7.h[4]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z4.h[4]\n"
-                        "fmla z23.h, z11.h, z5.h[4]\n"
-                        "fmla z27.h, z11.h, z6.h[4]\n"
-                        "fmla z31.h, z11.h, z7.h[4]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[5]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z5.h[5]\n"
-                        "fmla z24.h, z12.h, z6.h[5]\n"
-                        "fmla z28.h, z12.h, z7.h[5]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z4.h[5]\n"
-                        "fmla z21.h, z13.h, z5.h[5]\n"
-                        "fmla z25.h, z13.h, z6.h[5]\n"
-                        "fmla z29.h, z13.h, z7.h[5]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z4.h[5]\n"
-                        "fmla z22.h, z14.h, z5.h[5]\n"
-                        "fmla z26.h, z14.h, z6.h[5]\n"
-                        "fmla z30.h, z14.h, z7.h[5]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z4.h[5]\n"
-                        "fmla z23.h, z15.h, z5.h[5]\n"
-                        "fmla z27.h, z15.h, z6.h[5]\n"
-                        "fmla z31.h, z15.h, z7.h[5]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[6]\n"
-                        "fmla z20.h, z8.h, z5.h[6]\n"
-                        "fmla z24.h, z8.h, z6.h[6]\n"
-                        "fmla z28.h, z8.h, z7.h[6]\n"
-                        "fmla z17.h, z9.h, z4.h[6]\n"
-                        "fmla z21.h, z9.h, z5.h[6]\n"
-                        "fmla z25.h, z9.h, z6.h[6]\n"
-                        "fmla z29.h, z9.h, z7.h[6]\n"
-                        "fmla z18.h, z10.h, z4.h[6]\n"
-                        "fmla z22.h, z10.h, z5.h[6]\n"
-                        "fmla z26.h, z10.h, z6.h[6]\n"
-                        "fmla z30.h, z10.h, z7.h[6]\n"
-                        "fmla z19.h, z11.h, z4.h[6]\n"
-                        "fmla z23.h, z11.h, z5.h[6]\n"
-                        "fmla z27.h, z11.h, z6.h[6]\n"
-                        "fmla z31.h, z11.h, z7.h[6]\n"
-                        "fmla z16.h, z12.h, z4.h[7]\n"
-                        "fmla z20.h, z12.h, z5.h[7]\n"
-                        "fmla z24.h, z12.h, z6.h[7]\n"
-                        "fmla z28.h, z12.h, z7.h[7]\n"
-                        "fmla z17.h, z13.h, z4.h[7]\n"
-                        "fmla z21.h, z13.h, z5.h[7]\n"
-                        "fmla z25.h, z13.h, z6.h[7]\n"
-                        "fmla z29.h, z13.h, z7.h[7]\n"
-                        "fmla z18.h, z14.h, z4.h[7]\n"
-                        "fmla z22.h, z14.h, z5.h[7]\n"
-                        "fmla z26.h, z14.h, z6.h[7]\n"
-                        "fmla z30.h, z14.h, z7.h[7]\n"
-                        "fmla z19.h, z15.h, z4.h[7]\n"
-                        "fmla z23.h, z15.h, z5.h[7]\n"
-                        "fmla z27.h, z15.h, z6.h[7]\n"
-                        "fmla z31.h, z15.h, z7.h[7]\n"
-                        "cbz %[blocks], 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "fmla z20.h, z8.h, z1.h[0]\n"
-                        "fmla z24.h, z8.h, z2.h[0]\n"
-                        "fmla z28.h, z8.h, z3.h[0]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "fmla z21.h, z9.h, z1.h[0]\n"
-                        "fmla z25.h, z9.h, z2.h[0]\n"
-                        "fmla z29.h, z9.h, z3.h[0]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "fmla z22.h, z10.h, z1.h[0]\n"
-                        "fmla z26.h, z10.h, z2.h[0]\n"
-                        "fmla z30.h, z10.h, z3.h[0]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "fmla z23.h, z11.h, z1.h[0]\n"
-                        "fmla z27.h, z11.h, z2.h[0]\n"
-                        "fmla z31.h, z11.h, z3.h[0]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "fmla z20.h, z12.h, z1.h[1]\n"
-                        "fmla z24.h, z12.h, z2.h[1]\n"
-                        "fmla z28.h, z12.h, z3.h[1]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "fmla z21.h, z13.h, z1.h[1]\n"
-                        "fmla z25.h, z13.h, z2.h[1]\n"
-                        "fmla z29.h, z13.h, z3.h[1]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "fmla z22.h, z14.h, z1.h[1]\n"
-                        "fmla z26.h, z14.h, z2.h[1]\n"
-                        "fmla z30.h, z14.h, z3.h[1]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "fmla z23.h, z15.h, z1.h[1]\n"
-                        "fmla z27.h, z15.h, z2.h[1]\n"
-                        "fmla z31.h, z15.h, z3.h[1]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "fmla z20.h, z8.h, z1.h[2]\n"
-                        "fmla z24.h, z8.h, z2.h[2]\n"
-                        "fmla z28.h, z8.h, z3.h[2]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "fmla z21.h, z9.h, z1.h[2]\n"
-                        "fmla z25.h, z9.h, z2.h[2]\n"
-                        "fmla z29.h, z9.h, z3.h[2]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "fmla z22.h, z10.h, z1.h[2]\n"
-                        "fmla z26.h, z10.h, z2.h[2]\n"
-                        "fmla z30.h, z10.h, z3.h[2]\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "fmla z23.h, z11.h, z1.h[2]\n"
-                        "fmla z27.h, z11.h, z2.h[2]\n"
-                        "fmla z31.h, z11.h, z3.h[2]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "fmla z20.h, z12.h, z1.h[3]\n"
-                        "fmla z24.h, z12.h, z2.h[3]\n"
-                        "fmla z28.h, z12.h, z3.h[3]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "fmla z21.h, z13.h, z1.h[3]\n"
-                        "fmla z25.h, z13.h, z2.h[3]\n"
-                        "fmla z29.h, z13.h, z3.h[3]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "fmla z22.h, z14.h, z1.h[3]\n"
-                        "fmla z26.h, z14.h, z2.h[3]\n"
-                        "fmla z30.h, z14.h, z3.h[3]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "fmla z23.h, z15.h, z1.h[3]\n"
-                        "fmla z27.h, z15.h, z2.h[3]\n"
-                        "fmla z31.h, z15.h, z3.h[3]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "fmla z20.h, z8.h, z1.h[4]\n"
-                        "fmla z24.h, z8.h, z2.h[4]\n"
-                        "fmla z28.h, z8.h, z3.h[4]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "fmla z21.h, z9.h, z1.h[4]\n"
-                        "fmla z25.h, z9.h, z2.h[4]\n"
-                        "fmla z29.h, z9.h, z3.h[4]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "fmla z22.h, z10.h, z1.h[4]\n"
-                        "fmla z26.h, z10.h, z2.h[4]\n"
-                        "fmla z30.h, z10.h, z3.h[4]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "fmla z23.h, z11.h, z1.h[4]\n"
-                        "fmla z27.h, z11.h, z2.h[4]\n"
-                        "fmla z31.h, z11.h, z3.h[4]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "fmla z20.h, z12.h, z1.h[5]\n"
-                        "fmla z24.h, z12.h, z2.h[5]\n"
-                        "fmla z28.h, z12.h, z3.h[5]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "fmla z21.h, z13.h, z1.h[5]\n"
-                        "fmla z25.h, z13.h, z2.h[5]\n"
-                        "fmla z29.h, z13.h, z3.h[5]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "fmla z22.h, z14.h, z1.h[5]\n"
-                        "fmla z26.h, z14.h, z2.h[5]\n"
-                        "fmla z30.h, z14.h, z3.h[5]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "fmla z23.h, z15.h, z1.h[5]\n"
-                        "fmla z27.h, z15.h, z2.h[5]\n"
-                        "fmla z31.h, z15.h, z3.h[5]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "fmla z20.h, z8.h, z1.h[6]\n"
-                        "fmla z24.h, z8.h, z2.h[6]\n"
-                        "fmla z28.h, z8.h, z3.h[6]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "fmla z21.h, z9.h, z1.h[6]\n"
-                        "fmla z25.h, z9.h, z2.h[6]\n"
-                        "fmla z29.h, z9.h, z3.h[6]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "fmla z22.h, z10.h, z1.h[6]\n"
-                        "fmla z26.h, z10.h, z2.h[6]\n"
-                        "fmla z30.h, z10.h, z3.h[6]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "fmla z23.h, z11.h, z1.h[6]\n"
-                        "fmla z27.h, z11.h, z2.h[6]\n"
-                        "fmla z31.h, z11.h, z3.h[6]\n"
-                        "b 4f\n"
-                        "3:\n"
-                        "fmla z16.h, z8.h, z0.h[0]\n"
-                        "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
-                        "fmla z20.h, z8.h, z1.h[0]\n"
-                        "ld1rqh z5.h, p6/z, [a_ptr1]\n"
-                        "fmla z24.h, z8.h, z2.h[0]\n"
-                        "ld1rqh z6.h, p6/z, [a_ptr2]\n"
-                        "fmla z28.h, z8.h, z3.h[0]\n"
-                        "ld1rqh z7.h, p6/z, [a_ptr3]\n"
-                        "fmla z17.h, z9.h, z0.h[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z21.h, z9.h, z1.h[0]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z25.h, z9.h, z2.h[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "fmla z29.h, z9.h, z3.h[0]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[0]\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        "fmla z22.h, z10.h, z1.h[0]\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        "fmla z26.h, z10.h, z2.h[0]\n"
-                        "addvl a_ptr3, a_ptr3, #1\n"
-                        "fmla z30.h, z10.h, z3.h[0]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[0]\n"
-                        "fmla z23.h, z11.h, z1.h[0]\n"
-                        "fmla z27.h, z11.h, z2.h[0]\n"
-                        "fmla z31.h, z11.h, z3.h[0]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[1]\n"
-                        "fmla z24.h, z12.h, z2.h[1]\n"
-                        "fmla z28.h, z12.h, z3.h[1]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[1]\n"
-                        "fmla z21.h, z13.h, z1.h[1]\n"
-                        "fmla z25.h, z13.h, z2.h[1]\n"
-                        "fmla z29.h, z13.h, z3.h[1]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[1]\n"
-                        "fmla z22.h, z14.h, z1.h[1]\n"
-                        "fmla z26.h, z14.h, z2.h[1]\n"
-                        "fmla z30.h, z14.h, z3.h[1]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[1]\n"
-                        "fmla z23.h, z15.h, z1.h[1]\n"
-                        "fmla z27.h, z15.h, z2.h[1]\n"
-                        "fmla z31.h, z15.h, z3.h[1]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z1.h[2]\n"
-                        "fmla z24.h, z8.h, z2.h[2]\n"
-                        "fmla z28.h, z8.h, z3.h[2]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[2]\n"
-                        "fmla z21.h, z9.h, z1.h[2]\n"
-                        "fmla z25.h, z9.h, z2.h[2]\n"
-                        "fmla z29.h, z9.h, z3.h[2]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[2]\n"
-                        "fmla z22.h, z10.h, z1.h[2]\n"
-                        "fmla z26.h, z10.h, z2.h[2]\n"
-                        "fmla z30.h, z10.h, z3.h[2]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[2]\n"
-                        "fmla z23.h, z11.h, z1.h[2]\n"
-                        "fmla z27.h, z11.h, z2.h[2]\n"
-                        "fmla z31.h, z11.h, z3.h[2]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[3]\n"
-                        "fmla z24.h, z12.h, z2.h[3]\n"
-                        "fmla z28.h, z12.h, z3.h[3]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[3]\n"
-                        "fmla z21.h, z13.h, z1.h[3]\n"
-                        "fmla z25.h, z13.h, z2.h[3]\n"
-                        "fmla z29.h, z13.h, z3.h[3]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[3]\n"
-                        "fmla z22.h, z14.h, z1.h[3]\n"
-                        "fmla z26.h, z14.h, z2.h[3]\n"
-                        "fmla z30.h, z14.h, z3.h[3]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[3]\n"
-                        "fmla z23.h, z15.h, z1.h[3]\n"
-                        "fmla z27.h, z15.h, z2.h[3]\n"
-                        "fmla z31.h, z15.h, z3.h[3]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[4]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z8.h, z1.h[4]\n"
-                        "fmla z24.h, z8.h, z2.h[4]\n"
-                        "fmla z28.h, z8.h, z3.h[4]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z9.h, z0.h[4]\n"
-                        "fmla z21.h, z9.h, z1.h[4]\n"
-                        "fmla z25.h, z9.h, z2.h[4]\n"
-                        "fmla z29.h, z9.h, z3.h[4]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z10.h, z0.h[4]\n"
-                        "fmla z22.h, z10.h, z1.h[4]\n"
-                        "fmla z26.h, z10.h, z2.h[4]\n"
-                        "fmla z30.h, z10.h, z3.h[4]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z11.h, z0.h[4]\n"
-                        "fmla z23.h, z11.h, z1.h[4]\n"
-                        "fmla z27.h, z11.h, z2.h[4]\n"
-                        "fmla z31.h, z11.h, z3.h[4]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z0.h[5]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.h, z12.h, z1.h[5]\n"
-                        "fmla z24.h, z12.h, z2.h[5]\n"
-                        "fmla z28.h, z12.h, z3.h[5]\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.h, z13.h, z0.h[5]\n"
-                        "fmla z21.h, z13.h, z1.h[5]\n"
-                        "fmla z25.h, z13.h, z2.h[5]\n"
-                        "fmla z29.h, z13.h, z3.h[5]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.h, z14.h, z0.h[5]\n"
-                        "fmla z22.h, z14.h, z1.h[5]\n"
-                        "fmla z26.h, z14.h, z2.h[5]\n"
-                        "fmla z30.h, z14.h, z3.h[5]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.h, z15.h, z0.h[5]\n"
-                        "fmla z23.h, z15.h, z1.h[5]\n"
-                        "fmla z27.h, z15.h, z2.h[5]\n"
-                        "fmla z31.h, z15.h, z3.h[5]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z0.h[6]\n"
-                        "fmla z20.h, z8.h, z1.h[6]\n"
-                        "fmla z24.h, z8.h, z2.h[6]\n"
-                        "fmla z28.h, z8.h, z3.h[6]\n"
-                        "fmla z17.h, z9.h, z0.h[6]\n"
-                        "fmla z21.h, z9.h, z1.h[6]\n"
-                        "fmla z25.h, z9.h, z2.h[6]\n"
-                        "fmla z29.h, z9.h, z3.h[6]\n"
-                        "fmla z18.h, z10.h, z0.h[6]\n"
-                        "fmla z22.h, z10.h, z1.h[6]\n"
-                        "fmla z26.h, z10.h, z2.h[6]\n"
-                        "fmla z30.h, z10.h, z3.h[6]\n"
-                        "fmla z19.h, z11.h, z0.h[6]\n"
-                        "fmla z23.h, z11.h, z1.h[6]\n"
-                        "fmla z27.h, z11.h, z2.h[6]\n"
-                        "fmla z31.h, z11.h, z3.h[6]\n"
-                        "fmla z16.h, z12.h, z0.h[7]\n"
-                        "fmla z20.h, z12.h, z1.h[7]\n"
-                        "fmla z24.h, z12.h, z2.h[7]\n"
-                        "fmla z28.h, z12.h, z3.h[7]\n"
-                        "fmla z17.h, z13.h, z0.h[7]\n"
-                        "fmla z21.h, z13.h, z1.h[7]\n"
-                        "fmla z25.h, z13.h, z2.h[7]\n"
-                        "fmla z29.h, z13.h, z3.h[7]\n"
-                        "fmla z18.h, z14.h, z0.h[7]\n"
-                        "fmla z22.h, z14.h, z1.h[7]\n"
-                        "fmla z26.h, z14.h, z2.h[7]\n"
-                        "fmla z30.h, z14.h, z3.h[7]\n"
-                        "fmla z19.h, z15.h, z0.h[7]\n"
-                        "fmla z23.h, z15.h, z1.h[7]\n"
-                        "fmla z27.h, z15.h, z2.h[7]\n"
-                        "fmla z31.h, z15.h, z3.h[7]\n"
-                        "cbz %[blocks], 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[0]\n"
-                        "fmla z20.h, z8.h, z5.h[0]\n"
-                        "fmla z24.h, z8.h, z6.h[0]\n"
-                        "fmla z28.h, z8.h, z7.h[0]\n"
-                        "fmla z17.h, z9.h, z4.h[0]\n"
-                        "fmla z21.h, z9.h, z5.h[0]\n"
-                        "fmla z25.h, z9.h, z6.h[0]\n"
-                        "fmla z29.h, z9.h, z7.h[0]\n"
-                        "fmla z18.h, z10.h, z4.h[0]\n"
-                        "fmla z22.h, z10.h, z5.h[0]\n"
-                        "fmla z26.h, z10.h, z6.h[0]\n"
-                        "fmla z30.h, z10.h, z7.h[0]\n"
-                        "fmla z19.h, z11.h, z4.h[0]\n"
-                        "fmla z23.h, z11.h, z5.h[0]\n"
-                        "fmla z27.h, z11.h, z6.h[0]\n"
-                        "fmla z31.h, z11.h, z7.h[0]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[1]\n"
-                        "fmla z20.h, z12.h, z5.h[1]\n"
-                        "fmla z24.h, z12.h, z6.h[1]\n"
-                        "fmla z28.h, z12.h, z7.h[1]\n"
-                        "fmla z17.h, z13.h, z4.h[1]\n"
-                        "fmla z21.h, z13.h, z5.h[1]\n"
-                        "fmla z25.h, z13.h, z6.h[1]\n"
-                        "fmla z29.h, z13.h, z7.h[1]\n"
-                        "fmla z18.h, z14.h, z4.h[1]\n"
-                        "fmla z22.h, z14.h, z5.h[1]\n"
-                        "fmla z26.h, z14.h, z6.h[1]\n"
-                        "fmla z30.h, z14.h, z7.h[1]\n"
-                        "fmla z19.h, z15.h, z4.h[1]\n"
-                        "fmla z23.h, z15.h, z5.h[1]\n"
-                        "fmla z27.h, z15.h, z6.h[1]\n"
-                        "fmla z31.h, z15.h, z7.h[1]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[2]\n"
-                        "fmla z20.h, z8.h, z5.h[2]\n"
-                        "fmla z24.h, z8.h, z6.h[2]\n"
-                        "fmla z28.h, z8.h, z7.h[2]\n"
-                        "fmla z17.h, z9.h, z4.h[2]\n"
-                        "fmla z21.h, z9.h, z5.h[2]\n"
-                        "fmla z25.h, z9.h, z6.h[2]\n"
-                        "fmla z29.h, z9.h, z7.h[2]\n"
-                        "fmla z18.h, z10.h, z4.h[2]\n"
-                        "fmla z22.h, z10.h, z5.h[2]\n"
-                        "fmla z26.h, z10.h, z6.h[2]\n"
-                        "fmla z30.h, z10.h, z7.h[2]\n"
-                        "fmla z19.h, z11.h, z4.h[2]\n"
-                        "fmla z23.h, z11.h, z5.h[2]\n"
-                        "fmla z27.h, z11.h, z6.h[2]\n"
-                        "fmla z31.h, z11.h, z7.h[2]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[3]\n"
-                        "fmla z20.h, z12.h, z5.h[3]\n"
-                        "fmla z24.h, z12.h, z6.h[3]\n"
-                        "fmla z28.h, z12.h, z7.h[3]\n"
-                        "fmla z17.h, z13.h, z4.h[3]\n"
-                        "fmla z21.h, z13.h, z5.h[3]\n"
-                        "fmla z25.h, z13.h, z6.h[3]\n"
-                        "fmla z29.h, z13.h, z7.h[3]\n"
-                        "fmla z18.h, z14.h, z4.h[3]\n"
-                        "fmla z22.h, z14.h, z5.h[3]\n"
-                        "fmla z26.h, z14.h, z6.h[3]\n"
-                        "fmla z30.h, z14.h, z7.h[3]\n"
-                        "fmla z19.h, z15.h, z4.h[3]\n"
-                        "fmla z23.h, z15.h, z5.h[3]\n"
-                        "fmla z27.h, z15.h, z6.h[3]\n"
-                        "fmla z31.h, z15.h, z7.h[3]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[4]\n"
-                        "fmla z20.h, z8.h, z5.h[4]\n"
-                        "fmla z24.h, z8.h, z6.h[4]\n"
-                        "fmla z28.h, z8.h, z7.h[4]\n"
-                        "fmla z17.h, z9.h, z4.h[4]\n"
-                        "fmla z21.h, z9.h, z5.h[4]\n"
-                        "fmla z25.h, z9.h, z6.h[4]\n"
-                        "fmla z29.h, z9.h, z7.h[4]\n"
-                        "fmla z18.h, z10.h, z4.h[4]\n"
-                        "fmla z22.h, z10.h, z5.h[4]\n"
-                        "fmla z26.h, z10.h, z6.h[4]\n"
-                        "fmla z30.h, z10.h, z7.h[4]\n"
-                        "fmla z19.h, z11.h, z4.h[4]\n"
-                        "fmla z23.h, z11.h, z5.h[4]\n"
-                        "fmla z27.h, z11.h, z6.h[4]\n"
-                        "fmla z31.h, z11.h, z7.h[4]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1h z12.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z13.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z14.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z15.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z12.h, z4.h[5]\n"
-                        "fmla z20.h, z12.h, z5.h[5]\n"
-                        "fmla z24.h, z12.h, z6.h[5]\n"
-                        "fmla z28.h, z12.h, z7.h[5]\n"
-                        "fmla z17.h, z13.h, z4.h[5]\n"
-                        "fmla z21.h, z13.h, z5.h[5]\n"
-                        "fmla z25.h, z13.h, z6.h[5]\n"
-                        "fmla z29.h, z13.h, z7.h[5]\n"
-                        "fmla z18.h, z14.h, z4.h[5]\n"
-                        "fmla z22.h, z14.h, z5.h[5]\n"
-                        "fmla z26.h, z14.h, z6.h[5]\n"
-                        "fmla z30.h, z14.h, z7.h[5]\n"
-                        "fmla z19.h, z15.h, z4.h[5]\n"
-                        "fmla z23.h, z15.h, z5.h[5]\n"
-                        "fmla z27.h, z15.h, z6.h[5]\n"
-                        "fmla z31.h, z15.h, z7.h[5]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "ld1h z8.h, p0/z, [%[b_ptr0]]\n"
-                        "ld1h z9.h, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1h z10.h, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1h z11.h, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.h, z8.h, z4.h[6]\n"
-                        "fmla z20.h, z8.h, z5.h[6]\n"
-                        "fmla z24.h, z8.h, z6.h[6]\n"
-                        "fmla z28.h, z8.h, z7.h[6]\n"
-                        "fmla z17.h, z9.h, z4.h[6]\n"
-                        "fmla z21.h, z9.h, z5.h[6]\n"
-                        "fmla z25.h, z9.h, z6.h[6]\n"
-                        "fmla z29.h, z9.h, z7.h[6]\n"
-                        "fmla z18.h, z10.h, z4.h[6]\n"
-                        "fmla z22.h, z10.h, z5.h[6]\n"
-                        "fmla z26.h, z10.h, z6.h[6]\n"
-                        "fmla z30.h, z10.h, z7.h[6]\n"
-                        "fmla z19.h, z11.h, z4.h[6]\n"
-                        "fmla z23.h, z11.h, z5.h[6]\n"
-                        "fmla z27.h, z11.h, z6.h[6]\n"
-                        "fmla z31.h, z11.h, z7.h[6]\n"
-                        "4:\n"
-                        "ld1rh z14.h, p7/z, [%[minptr]]\n"
-                        "ld1rh z15.h, p7/z, [%[maxptr]]\n"
-                        "fmax z16.h, p7/m, z16.h, z14.h\n"
-                        "fmax z17.h, p7/m, z17.h, z14.h\n"
-                        "fmax z18.h, p7/m, z18.h, z14.h\n"
-                        "fmax z19.h, p7/m, z19.h, z14.h\n"
-                        "fmin z16.h, p7/m, z16.h, z15.h\n"
-                        "fmin z17.h, p7/m, z17.h, z15.h\n"
-                        "fmin z18.h, p7/m, z18.h, z15.h\n"
-                        "fmin z19.h, p7/m, z19.h, z15.h\n"
-                        "st1h z16.h, p0, [%[c_ptr0]]\n"
-                        "fmax z20.h, p7/m, z20.h, z14.h\n"
-                        "fmax z21.h, p7/m, z21.h, z14.h\n"
-                        "fmax z22.h, p7/m, z22.h, z14.h\n"
-                        "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmax z23.h, p7/m, z23.h, z14.h\n"
-                        "fmin z20.h, p7/m, z20.h, z15.h\n"
-                        "fmin z21.h, p7/m, z21.h, z15.h\n"
-                        "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "fmin z22.h, p7/m, z22.h, z15.h\n"
-                        "fmin z23.h, p7/m, z23.h, z15.h\n"
-                        "fmax z24.h, p7/m, z24.h, z14.h\n"
-                        "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "fmax z25.h, p7/m, z25.h, z14.h\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "fmax z26.h, p7/m, z26.h, z14.h\n"
-                        "st1h z20.h, p0, [c_ptr1]\n"
-                        "fmin z24.h, p7/m, z24.h, z15.h\n"
-                        "fmin z25.h, p7/m, z25.h, z15.h\n"
-                        "fmax z27.h, p7/m, z27.h, z14.h\n"
-                        "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
-                        "fmin z26.h, p7/m, z26.h, z15.h\n"
-                        "fmax z28.h, p7/m, z28.h, z14.h\n"
-                        "fmax z29.h, p7/m, z29.h, z14.h\n"
-                        "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
-                        "fmin z27.h, p7/m, z27.h, z15.h\n"
-                        "fmax z30.h, p7/m, z30.h, z14.h\n"
-                        "fmin z28.h, p7/m, z28.h, z15.h\n"
-                        "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
-                        "fmin z29.h, p7/m, z29.h, z15.h\n"
-                        "fmax z31.h, p7/m, z31.h, z14.h\n"
-                        "fmin z30.h, p7/m, z30.h, z15.h\n"
-                        "st1h z24.h, p0, [c_ptr2]\n"
-                        "fmin z31.h, p7/m, z31.h, z15.h\n"
-                        "st1h z25.h, p1, [c_ptr2, #1, MUL VL]\n"
-                        "st1h z26.h, p2, [c_ptr2, #2, MUL VL]\n"
-                        "st1h z27.h, p3, [c_ptr2, #3, MUL VL]\n"
-                        "st1h z28.h, p0, [c_ptr3]\n"
-                        "st1h z29.h, p1, [c_ptr3, #1, MUL VL]\n"
-                        "st1h z30.h, p2, [c_ptr3, #2, MUL VL]\n"
-                        "st1h z31.h, p3, [c_ptr3, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
-                    );
-                    break;
-            }
-
-        }
-    }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp
deleted file mode 100644
index 3fc0e5f..0000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp
+++ /dev/null
@@ -1,2055 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_native_fp32_mla_4VLx4(const float *A, int lda, const float *B, int ldb, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool append) {
-    const long loops_count = ((K + 4) / 8) - 1;
-    K -= loops_count * 8;
-    const long regs_count = (K / 4) - 1;
-    K -= (regs_count + 1) * 4;
-    const long leftovers = K;
-    float nullbias[256];
-    if (!append && !bias) {
-        memset(nullbias, 0, (4 * get_vector_length<float>() * sizeof(float)));
-    }
-    float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
-    float maxval =   static_cast<float>(std::numeric_limits<float>::infinity());
-    const float * const minptr = &minval;
-    const float * const maxptr = &maxval;
-
-    switch(act.type)
-    {
-        default:
-        case Activation::Type::None:
-            break;
-        case Activation::Type::BoundedReLU:
-            maxval = static_cast<float>(act.param1);
-            /* fall through */
-        case Activation::Type::ReLU:
-            minval = 0.0f;
-            break;
-    }
-
-    for (int y=0; y<M; y+=4) {
-        const float * const a_ptr0_base = A + (y * lda);
-        const unsigned long ldab = lda * sizeof(float);
-
-        float *c_ptr0 = C + (y * ldc);
-
-        for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) {
-            const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>()));
-            long loops = loops_count;
-            long regs = regs_count;
-            long temp = 0;
-            long blocks = leftovers;
-            const float *a_ptr0 = a_ptr0_base;
-            const float *b_ptr0 = B + x0;
-            long ldbb = ldb * sizeof(float);
-            const unsigned long ldcb = ldc * sizeof(float);
-            const float *biasptr = bias ? bias+x0 : nullbias;
-
-            switch(M-y) {
-                case 1:
-                    __asm __volatile (
-                        "whilelt p6.s, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.s\n"
-                        "ld1w z16.s, p0/z, [%[biasptr]]\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z17.s, z13.s, z0.s[3]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z18.s, z14.s, z0.s[3]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[3]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z16.s, z8.s, z4.s[0]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "fmla z17.s, z9.s, z4.s[0]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z18.s, z10.s, z4.s[0]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z19.s, z11.s, z4.s[0]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z16.s, z12.s, z4.s[1]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z13.s, z4.s[1]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z14.s, z4.s[1]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z4.s[1]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z17.s, z9.s, z4.s[2]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z18.s, z10.s, z4.s[2]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z19.s, z11.s, z4.s[2]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z16.s, z12.s, z4.s[3]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z13.s, z4.s[3]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z14.s, z4.s[3]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z4.s[3]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "cbz %[regs], 3f\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z16.s, z12.s, z0.s[3]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z13.s, z0.s[3]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[3]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[3]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[0]\n"
-                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
-                        "fmla z17.s, z9.s, z4.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z18.s, z10.s, z4.s[0]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z19.s, z11.s, z4.s[0]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "fmla z16.s, z12.s, z4.s[1]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z17.s, z13.s, z4.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z18.s, z14.s, z4.s[1]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z19.s, z15.s, z4.s[1]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z16.s, z8.s, z4.s[2]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z9.s, z4.s[2]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z4.s[2]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z4.s[2]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[3]\n"
-                        "fmla z17.s, z13.s, z4.s[3]\n"
-                        "fmla z18.s, z14.s, z4.s[3]\n"
-                        "fmla z19.s, z15.s, z4.s[3]\n"
-                        "cbz %[blocks], 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "b 4f\n"
-                        "3:\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "fmla z16.s, z12.s, z0.s[3]\n"
-                        "fmla z17.s, z13.s, z0.s[3]\n"
-                        "fmla z18.s, z14.s, z0.s[3]\n"
-                        "fmla z19.s, z15.s, z0.s[3]\n"
-                        "cbz %[blocks], 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[0]\n"
-                        "fmla z17.s, z9.s, z4.s[0]\n"
-                        "fmla z18.s, z10.s, z4.s[0]\n"
-                        "fmla z19.s, z11.s, z4.s[0]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[1]\n"
-                        "fmla z17.s, z13.s, z4.s[1]\n"
-                        "fmla z18.s, z14.s, z4.s[1]\n"
-                        "fmla z19.s, z15.s, z4.s[1]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[2]\n"
-                        "fmla z17.s, z9.s, z4.s[2]\n"
-                        "fmla z18.s, z10.s, z4.s[2]\n"
-                        "fmla z19.s, z11.s, z4.s[2]\n"
-                        "4:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
-                    );
-                    break;
-                case 2:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "whilelt p6.s, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.s\n"
-                        "ld1w z16.s, p0/z, [%[biasptr]]\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "mov z20.d, z16.d\n"
-                        "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "mov z21.d, z17.d\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "mov z22.d, z18.d\n"
-                        "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z23.d, z19.d\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z20.s, z8.s, z1.s[0]\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
-                        "fmla z21.s, z9.s, z1.s[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z22.s, z10.s, z1.s[0]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z23.s, z11.s, z1.s[0]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z20.s, z12.s, z1.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z21.s, z13.s, z1.s[1]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla z22.s, z14.s, z1.s[1]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla z23.s, z15.s, z1.s[1]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z8.s, z1.s[2]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "fmla z21.s, z9.s, z1.s[2]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "fmla z22.s, z10.s, z1.s[2]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "fmla z23.s, z11.s, z1.s[2]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z12.s, z1.s[3]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z13.s, z0.s[3]\n"
-                        "fmla z21.s, z13.s, z1.s[3]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[3]\n"
-                        "fmla z22.s, z14.s, z1.s[3]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[3]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "fmla z23.s, z15.s, z1.s[3]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[0]\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
-                        "fmla z20.s, z8.s, z5.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z17.s, z9.s, z4.s[0]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z21.s, z9.s, z5.s[0]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z4.s[0]\n"
-                        "fmla z22.s, z10.s, z5.s[0]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z4.s[0]\n"
-                        "fmla z23.s, z11.s, z5.s[0]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z12.s, z5.s[1]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z13.s, z4.s[1]\n"
-                        "fmla z21.s, z13.s, z5.s[1]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z14.s, z4.s[1]\n"
-                        "fmla z22.s, z14.s, z5.s[1]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z4.s[1]\n"
-                        "fmla z23.s, z15.s, z5.s[1]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z8.s, z5.s[2]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z9.s, z4.s[2]\n"
-                        "fmla z21.s, z9.s, z5.s[2]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z4.s[2]\n"
-                        "fmla z22.s, z10.s, z5.s[2]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z4.s[2]\n"
-                        "fmla z23.s, z11.s, z5.s[2]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z12.s, z5.s[3]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z13.s, z4.s[3]\n"
-                        "fmla z21.s, z13.s, z5.s[3]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z14.s, z4.s[3]\n"
-                        "fmla z22.s, z14.s, z5.s[3]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z4.s[3]\n"
-                        "fmla z23.s, z15.s, z5.s[3]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "cbz %[regs], 3f\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        "fmla z20.s, z8.s, z1.s[0]\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z21.s, z9.s, z1.s[0]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z22.s, z10.s, z1.s[0]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "fmla z23.s, z11.s, z1.s[0]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z12.s, z1.s[1]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "fmla z21.s, z13.s, z1.s[1]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "fmla z22.s, z14.s, z1.s[1]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "fmla z23.s, z15.s, z1.s[1]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z8.s, z1.s[2]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "fmla z21.s, z9.s, z1.s[2]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "fmla z22.s, z10.s, z1.s[2]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "fmla z23.s, z11.s, z1.s[2]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z12.s, z1.s[3]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z13.s, z0.s[3]\n"
-                        "fmla z21.s, z13.s, z1.s[3]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[3]\n"
-                        "fmla z22.s, z14.s, z1.s[3]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[3]\n"
-                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
-                        "fmla z23.s, z15.s, z1.s[3]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[0]\n"
-                        "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
-                        "fmla z20.s, z8.s, z5.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z17.s, z9.s, z4.s[0]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z21.s, z9.s, z5.s[0]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z4.s[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "fmla z22.s, z10.s, z5.s[0]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z4.s[0]\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        "fmla z23.s, z11.s, z5.s[0]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z12.s, z5.s[1]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z13.s, z4.s[1]\n"
-                        "fmla z21.s, z13.s, z5.s[1]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z14.s, z4.s[1]\n"
-                        "fmla z22.s, z14.s, z5.s[1]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z4.s[1]\n"
-                        "fmla z23.s, z15.s, z5.s[1]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[2]\n"
-                        "fmla z20.s, z8.s, z5.s[2]\n"
-                        "fmla z17.s, z9.s, z4.s[2]\n"
-                        "fmla z21.s, z9.s, z5.s[2]\n"
-                        "fmla z18.s, z10.s, z4.s[2]\n"
-                        "fmla z22.s, z10.s, z5.s[2]\n"
-                        "fmla z19.s, z11.s, z4.s[2]\n"
-                        "fmla z23.s, z11.s, z5.s[2]\n"
-                        "fmla z16.s, z12.s, z4.s[3]\n"
-                        "fmla z20.s, z12.s, z5.s[3]\n"
-                        "fmla z17.s, z13.s, z4.s[3]\n"
-                        "fmla z21.s, z13.s, z5.s[3]\n"
-                        "fmla z18.s, z14.s, z4.s[3]\n"
-                        "fmla z22.s, z14.s, z5.s[3]\n"
-                        "fmla z19.s, z15.s, z4.s[3]\n"
-                        "fmla z23.s, z15.s, z5.s[3]\n"
-                        "cbz %[blocks], 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "fmla z20.s, z8.s, z1.s[0]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "fmla z21.s, z9.s, z1.s[0]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "fmla z22.s, z10.s, z1.s[0]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "fmla z23.s, z11.s, z1.s[0]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "fmla z20.s, z12.s, z1.s[1]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "fmla z21.s, z13.s, z1.s[1]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "fmla z22.s, z14.s, z1.s[1]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "fmla z23.s, z15.s, z1.s[1]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "fmla z20.s, z8.s, z1.s[2]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "fmla z21.s, z9.s, z1.s[2]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "fmla z22.s, z10.s, z1.s[2]\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "fmla z23.s, z11.s, z1.s[2]\n"
-                        "b 4f\n"
-                        "3:\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
-                        "fmla z20.s, z8.s, z1.s[0]\n"
-                        "ld1rqw z5.s, p6/z, [a_ptr1]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z21.s, z9.s, z1.s[0]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z22.s, z10.s, z1.s[0]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "fmla z23.s, z11.s, z1.s[0]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z12.s, z1.s[1]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        "fmla z21.s, z13.s, z1.s[1]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "fmla z22.s, z14.s, z1.s[1]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "fmla z23.s, z15.s, z1.s[1]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "fmla z20.s, z8.s, z1.s[2]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "fmla z21.s, z9.s, z1.s[2]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "fmla z22.s, z10.s, z1.s[2]\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "fmla z23.s, z11.s, z1.s[2]\n"
-                        "fmla z16.s, z12.s, z0.s[3]\n"
-                        "fmla z20.s, z12.s, z1.s[3]\n"
-                        "fmla z17.s, z13.s, z0.s[3]\n"
-                        "fmla z21.s, z13.s, z1.s[3]\n"
-                        "fmla z18.s, z14.s, z0.s[3]\n"
-                        "fmla z22.s, z14.s, z1.s[3]\n"
-                        "fmla z19.s, z15.s, z0.s[3]\n"
-                        "fmla z23.s, z15.s, z1.s[3]\n"
-                        "cbz %[blocks], 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[0]\n"
-                        "fmla z20.s, z8.s, z5.s[0]\n"
-                        "fmla z17.s, z9.s, z4.s[0]\n"
-                        "fmla z21.s, z9.s, z5.s[0]\n"
-                        "fmla z18.s, z10.s, z4.s[0]\n"
-                        "fmla z22.s, z10.s, z5.s[0]\n"
-                        "fmla z19.s, z11.s, z4.s[0]\n"
-                        "fmla z23.s, z11.s, z5.s[0]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[1]\n"
-                        "fmla z20.s, z12.s, z5.s[1]\n"
-                        "fmla z17.s, z13.s, z4.s[1]\n"
-                        "fmla z21.s, z13.s, z5.s[1]\n"
-                        "fmla z18.s, z14.s, z4.s[1]\n"
-                        "fmla z22.s, z14.s, z5.s[1]\n"
-                        "fmla z19.s, z15.s, z4.s[1]\n"
-                        "fmla z23.s, z15.s, z5.s[1]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[2]\n"
-                        "fmla z20.s, z8.s, z5.s[2]\n"
-                        "fmla z17.s, z9.s, z4.s[2]\n"
-                        "fmla z21.s, z9.s, z5.s[2]\n"
-                        "fmla z18.s, z10.s, z4.s[2]\n"
-                        "fmla z22.s, z10.s, z5.s[2]\n"
-                        "fmla z19.s, z11.s, z4.s[2]\n"
-                        "fmla z23.s, z11.s, z5.s[2]\n"
-                        "4:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "fmax z20.s, p7/m, z20.s, z14.s\n"
-                        "fmax z21.s, p7/m, z21.s, z14.s\n"
-                        "fmax z22.s, p7/m, z22.s, z14.s\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmax z23.s, p7/m, z23.s, z14.s\n"
-                        "fmin z20.s, p7/m, z20.s, z15.s\n"
-                        "fmin z21.s, p7/m, z21.s, z15.s\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "fmin z22.s, p7/m, z22.s, z15.s\n"
-                        "fmin z23.s, p7/m, z23.s, z15.s\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "st1w z20.s, p0, [c_ptr1]\n"
-                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
-                    );
-                    break;
-                case 3:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "whilelt p6.s, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.s\n"
-                        "ld1w z16.s, p0/z, [%[biasptr]]\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "mov z20.d, z16.d\n"
-                        "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "mov z24.d, z16.d\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
-                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "mov z21.d, z17.d\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "mov z25.d, z17.d\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "mov z22.d, z18.d\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "mov z26.d, z18.d\n"
-                        "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z23.d, z19.d\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "mov z27.d, z19.d\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z20.s, z8.s, z1.s[0]\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        "fmla z24.s, z8.s, z2.s[0]\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
-                        "fmla z21.s, z9.s, z1.s[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla z25.s, z9.s, z2.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z22.s, z10.s, z1.s[0]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z26.s, z10.s, z2.s[0]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla z23.s, z11.s, z1.s[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla z27.s, z11.s, z2.s[0]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "fmla z20.s, z12.s, z1.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z24.s, z12.s, z2.s[1]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "fmla z21.s, z13.s, z1.s[1]\n"
-                        "fmla z25.s, z13.s, z2.s[1]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "fmla z22.s, z14.s, z1.s[1]\n"
-                        "fmla z26.s, z14.s, z2.s[1]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "fmla z23.s, z15.s, z1.s[1]\n"
-                        "fmla z27.s, z15.s, z2.s[1]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z8.s, z1.s[2]\n"
-                        "fmla z24.s, z8.s, z2.s[2]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "fmla z21.s, z9.s, z1.s[2]\n"
-                        "fmla z25.s, z9.s, z2.s[2]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "fmla z22.s, z10.s, z1.s[2]\n"
-                        "fmla z26.s, z10.s, z2.s[2]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "fmla z23.s, z11.s, z1.s[2]\n"
-                        "fmla z27.s, z11.s, z2.s[2]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z12.s, z1.s[3]\n"
-                        "fmla z24.s, z12.s, z2.s[3]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z13.s, z0.s[3]\n"
-                        "fmla z21.s, z13.s, z1.s[3]\n"
-                        "fmla z25.s, z13.s, z2.s[3]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[3]\n"
-                        "fmla z22.s, z14.s, z1.s[3]\n"
-                        "fmla z26.s, z14.s, z2.s[3]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[3]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "fmla z23.s, z15.s, z1.s[3]\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
-                        "fmla z27.s, z15.s, z2.s[3]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[0]\n"
-                        "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
-                        "fmla z20.s, z8.s, z5.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z24.s, z8.s, z6.s[0]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z9.s, z4.s[0]\n"
-                        "fmla z21.s, z9.s, z5.s[0]\n"
-                        "fmla z25.s, z9.s, z6.s[0]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z4.s[0]\n"
-                        "fmla z22.s, z10.s, z5.s[0]\n"
-                        "fmla z26.s, z10.s, z6.s[0]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z4.s[0]\n"
-                        "fmla z23.s, z11.s, z5.s[0]\n"
-                        "fmla z27.s, z11.s, z6.s[0]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z12.s, z5.s[1]\n"
-                        "fmla z24.s, z12.s, z6.s[1]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z13.s, z4.s[1]\n"
-                        "fmla z21.s, z13.s, z5.s[1]\n"
-                        "fmla z25.s, z13.s, z6.s[1]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z14.s, z4.s[1]\n"
-                        "fmla z22.s, z14.s, z5.s[1]\n"
-                        "fmla z26.s, z14.s, z6.s[1]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z4.s[1]\n"
-                        "fmla z23.s, z15.s, z5.s[1]\n"
-                        "fmla z27.s, z15.s, z6.s[1]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z8.s, z5.s[2]\n"
-                        "fmla z24.s, z8.s, z6.s[2]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z9.s, z4.s[2]\n"
-                        "fmla z21.s, z9.s, z5.s[2]\n"
-                        "fmla z25.s, z9.s, z6.s[2]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z4.s[2]\n"
-                        "fmla z22.s, z10.s, z5.s[2]\n"
-                        "fmla z26.s, z10.s, z6.s[2]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z4.s[2]\n"
-                        "fmla z23.s, z11.s, z5.s[2]\n"
-                        "fmla z27.s, z11.s, z6.s[2]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z12.s, z5.s[3]\n"
-                        "fmla z24.s, z12.s, z6.s[3]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z13.s, z4.s[3]\n"
-                        "fmla z21.s, z13.s, z5.s[3]\n"
-                        "fmla z25.s, z13.s, z6.s[3]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z14.s, z4.s[3]\n"
-                        "fmla z22.s, z14.s, z5.s[3]\n"
-                        "fmla z26.s, z14.s, z6.s[3]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z4.s[3]\n"
-                        "fmla z23.s, z15.s, z5.s[3]\n"
-                        "fmla z27.s, z15.s, z6.s[3]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "cbz %[regs], 3f\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        "fmla z20.s, z8.s, z1.s[0]\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
-                        "fmla z24.s, z8.s, z2.s[0]\n"
-                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z21.s, z9.s, z1.s[0]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z25.s, z9.s, z2.s[0]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "fmla z22.s, z10.s, z1.s[0]\n"
-                        "fmla z26.s, z10.s, z2.s[0]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "fmla z23.s, z11.s, z1.s[0]\n"
-                        "fmla z27.s, z11.s, z2.s[0]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z12.s, z1.s[1]\n"
-                        "fmla z24.s, z12.s, z2.s[1]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "fmla z21.s, z13.s, z1.s[1]\n"
-                        "fmla z25.s, z13.s, z2.s[1]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "fmla z22.s, z14.s, z1.s[1]\n"
-                        "fmla z26.s, z14.s, z2.s[1]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "fmla z23.s, z15.s, z1.s[1]\n"
-                        "fmla z27.s, z15.s, z2.s[1]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z8.s, z1.s[2]\n"
-                        "fmla z24.s, z8.s, z2.s[2]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "fmla z21.s, z9.s, z1.s[2]\n"
-                        "fmla z25.s, z9.s, z2.s[2]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "fmla z22.s, z10.s, z1.s[2]\n"
-                        "fmla z26.s, z10.s, z2.s[2]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "fmla z23.s, z11.s, z1.s[2]\n"
-                        "fmla z27.s, z11.s, z2.s[2]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z12.s, z1.s[3]\n"
-                        "fmla z24.s, z12.s, z2.s[3]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z13.s, z0.s[3]\n"
-                        "fmla z21.s, z13.s, z1.s[3]\n"
-                        "fmla z25.s, z13.s, z2.s[3]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[3]\n"
-                        "fmla z22.s, z14.s, z1.s[3]\n"
-                        "fmla z26.s, z14.s, z2.s[3]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[3]\n"
-                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
-                        "fmla z23.s, z15.s, z1.s[3]\n"
-                        "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
-                        "fmla z27.s, z15.s, z2.s[3]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[0]\n"
-                        "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
-                        "fmla z20.s, z8.s, z5.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z24.s, z8.s, z6.s[0]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z9.s, z4.s[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "fmla z21.s, z9.s, z5.s[0]\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        "fmla z25.s, z9.s, z6.s[0]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z4.s[0]\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        "fmla z22.s, z10.s, z5.s[0]\n"
-                        "fmla z26.s, z10.s, z6.s[0]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z4.s[0]\n"
-                        "fmla z23.s, z11.s, z5.s[0]\n"
-                        "fmla z27.s, z11.s, z6.s[0]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z12.s, z5.s[1]\n"
-                        "fmla z24.s, z12.s, z6.s[1]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z13.s, z4.s[1]\n"
-                        "fmla z21.s, z13.s, z5.s[1]\n"
-                        "fmla z25.s, z13.s, z6.s[1]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z14.s, z4.s[1]\n"
-                        "fmla z22.s, z14.s, z5.s[1]\n"
-                        "fmla z26.s, z14.s, z6.s[1]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z4.s[1]\n"
-                        "fmla z23.s, z15.s, z5.s[1]\n"
-                        "fmla z27.s, z15.s, z6.s[1]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[2]\n"
-                        "fmla z20.s, z8.s, z5.s[2]\n"
-                        "fmla z24.s, z8.s, z6.s[2]\n"
-                        "fmla z17.s, z9.s, z4.s[2]\n"
-                        "fmla z21.s, z9.s, z5.s[2]\n"
-                        "fmla z25.s, z9.s, z6.s[2]\n"
-                        "fmla z18.s, z10.s, z4.s[2]\n"
-                        "fmla z22.s, z10.s, z5.s[2]\n"
-                        "fmla z26.s, z10.s, z6.s[2]\n"
-                        "fmla z19.s, z11.s, z4.s[2]\n"
-                        "fmla z23.s, z11.s, z5.s[2]\n"
-                        "fmla z27.s, z11.s, z6.s[2]\n"
-                        "fmla z16.s, z12.s, z4.s[3]\n"
-                        "fmla z20.s, z12.s, z5.s[3]\n"
-                        "fmla z24.s, z12.s, z6.s[3]\n"
-                        "fmla z17.s, z13.s, z4.s[3]\n"
-                        "fmla z21.s, z13.s, z5.s[3]\n"
-                        "fmla z25.s, z13.s, z6.s[3]\n"
-                        "fmla z18.s, z14.s, z4.s[3]\n"
-                        "fmla z22.s, z14.s, z5.s[3]\n"
-                        "fmla z26.s, z14.s, z6.s[3]\n"
-                        "fmla z19.s, z15.s, z4.s[3]\n"
-                        "fmla z23.s, z15.s, z5.s[3]\n"
-                        "fmla z27.s, z15.s, z6.s[3]\n"
-                        "cbz %[blocks], 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "fmla z20.s, z8.s, z1.s[0]\n"
-                        "fmla z24.s, z8.s, z2.s[0]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "fmla z21.s, z9.s, z1.s[0]\n"
-                        "fmla z25.s, z9.s, z2.s[0]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "fmla z22.s, z10.s, z1.s[0]\n"
-                        "fmla z26.s, z10.s, z2.s[0]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "fmla z23.s, z11.s, z1.s[0]\n"
-                        "fmla z27.s, z11.s, z2.s[0]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "fmla z20.s, z12.s, z1.s[1]\n"
-                        "fmla z24.s, z12.s, z2.s[1]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "fmla z21.s, z13.s, z1.s[1]\n"
-                        "fmla z25.s, z13.s, z2.s[1]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "fmla z22.s, z14.s, z1.s[1]\n"
-                        "fmla z26.s, z14.s, z2.s[1]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "fmla z23.s, z15.s, z1.s[1]\n"
-                        "fmla z27.s, z15.s, z2.s[1]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "fmla z20.s, z8.s, z1.s[2]\n"
-                        "fmla z24.s, z8.s, z2.s[2]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "fmla z21.s, z9.s, z1.s[2]\n"
-                        "fmla z25.s, z9.s, z2.s[2]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "fmla z22.s, z10.s, z1.s[2]\n"
-                        "fmla z26.s, z10.s, z2.s[2]\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "fmla z23.s, z11.s, z1.s[2]\n"
-                        "fmla z27.s, z11.s, z2.s[2]\n"
-                        "b 4f\n"
-                        "3:\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
-                        "fmla z20.s, z8.s, z1.s[0]\n"
-                        "ld1rqw z5.s, p6/z, [a_ptr1]\n"
-                        "fmla z24.s, z8.s, z2.s[0]\n"
-                        "ld1rqw z6.s, p6/z, [a_ptr2]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z21.s, z9.s, z1.s[0]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z25.s, z9.s, z2.s[0]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "fmla z22.s, z10.s, z1.s[0]\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        "fmla z26.s, z10.s, z2.s[0]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        "fmla z23.s, z11.s, z1.s[0]\n"
-                        "fmla z27.s, z11.s, z2.s[0]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z12.s, z1.s[1]\n"
-                        "fmla z24.s, z12.s, z2.s[1]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "fmla z21.s, z13.s, z1.s[1]\n"
-                        "fmla z25.s, z13.s, z2.s[1]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "fmla z22.s, z14.s, z1.s[1]\n"
-                        "fmla z26.s, z14.s, z2.s[1]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "fmla z23.s, z15.s, z1.s[1]\n"
-                        "fmla z27.s, z15.s, z2.s[1]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "fmla z20.s, z8.s, z1.s[2]\n"
-                        "fmla z24.s, z8.s, z2.s[2]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "fmla z21.s, z9.s, z1.s[2]\n"
-                        "fmla z25.s, z9.s, z2.s[2]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "fmla z22.s, z10.s, z1.s[2]\n"
-                        "fmla z26.s, z10.s, z2.s[2]\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "fmla z23.s, z11.s, z1.s[2]\n"
-                        "fmla z27.s, z11.s, z2.s[2]\n"
-                        "fmla z16.s, z12.s, z0.s[3]\n"
-                        "fmla z20.s, z12.s, z1.s[3]\n"
-                        "fmla z24.s, z12.s, z2.s[3]\n"
-                        "fmla z17.s, z13.s, z0.s[3]\n"
-                        "fmla z21.s, z13.s, z1.s[3]\n"
-                        "fmla z25.s, z13.s, z2.s[3]\n"
-                        "fmla z18.s, z14.s, z0.s[3]\n"
-                        "fmla z22.s, z14.s, z1.s[3]\n"
-                        "fmla z26.s, z14.s, z2.s[3]\n"
-                        "fmla z19.s, z15.s, z0.s[3]\n"
-                        "fmla z23.s, z15.s, z1.s[3]\n"
-                        "fmla z27.s, z15.s, z2.s[3]\n"
-                        "cbz %[blocks], 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[0]\n"
-                        "fmla z20.s, z8.s, z5.s[0]\n"
-                        "fmla z24.s, z8.s, z6.s[0]\n"
-                        "fmla z17.s, z9.s, z4.s[0]\n"
-                        "fmla z21.s, z9.s, z5.s[0]\n"
-                        "fmla z25.s, z9.s, z6.s[0]\n"
-                        "fmla z18.s, z10.s, z4.s[0]\n"
-                        "fmla z22.s, z10.s, z5.s[0]\n"
-                        "fmla z26.s, z10.s, z6.s[0]\n"
-                        "fmla z19.s, z11.s, z4.s[0]\n"
-                        "fmla z23.s, z11.s, z5.s[0]\n"
-                        "fmla z27.s, z11.s, z6.s[0]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[1]\n"
-                        "fmla z20.s, z12.s, z5.s[1]\n"
-                        "fmla z24.s, z12.s, z6.s[1]\n"
-                        "fmla z17.s, z13.s, z4.s[1]\n"
-                        "fmla z21.s, z13.s, z5.s[1]\n"
-                        "fmla z25.s, z13.s, z6.s[1]\n"
-                        "fmla z18.s, z14.s, z4.s[1]\n"
-                        "fmla z22.s, z14.s, z5.s[1]\n"
-                        "fmla z26.s, z14.s, z6.s[1]\n"
-                        "fmla z19.s, z15.s, z4.s[1]\n"
-                        "fmla z23.s, z15.s, z5.s[1]\n"
-                        "fmla z27.s, z15.s, z6.s[1]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[2]\n"
-                        "fmla z20.s, z8.s, z5.s[2]\n"
-                        "fmla z24.s, z8.s, z6.s[2]\n"
-                        "fmla z17.s, z9.s, z4.s[2]\n"
-                        "fmla z21.s, z9.s, z5.s[2]\n"
-                        "fmla z25.s, z9.s, z6.s[2]\n"
-                        "fmla z18.s, z10.s, z4.s[2]\n"
-                        "fmla z22.s, z10.s, z5.s[2]\n"
-                        "fmla z26.s, z10.s, z6.s[2]\n"
-                        "fmla z19.s, z11.s, z4.s[2]\n"
-                        "fmla z23.s, z11.s, z5.s[2]\n"
-                        "fmla z27.s, z11.s, z6.s[2]\n"
-                        "4:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "fmax z20.s, p7/m, z20.s, z14.s\n"
-                        "fmax z21.s, p7/m, z21.s, z14.s\n"
-                        "fmax z22.s, p7/m, z22.s, z14.s\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmax z23.s, p7/m, z23.s, z14.s\n"
-                        "fmin z20.s, p7/m, z20.s, z15.s\n"
-                        "fmin z21.s, p7/m, z21.s, z15.s\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "fmin z22.s, p7/m, z22.s, z15.s\n"
-                        "fmin z23.s, p7/m, z23.s, z15.s\n"
-                        "fmax z24.s, p7/m, z24.s, z14.s\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "fmax z25.s, p7/m, z25.s, z14.s\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "fmax z26.s, p7/m, z26.s, z14.s\n"
-                        "st1w z20.s, p0, [c_ptr1]\n"
-                        "fmin z24.s, p7/m, z24.s, z15.s\n"
-                        "fmin z25.s, p7/m, z25.s, z15.s\n"
-                        "fmax z27.s, p7/m, z27.s, z14.s\n"
-                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "fmin z26.s, p7/m, z26.s, z15.s\n"
-                        "fmin z27.s, p7/m, z27.s, z15.s\n"
-                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
-                        "st1w z24.s, p0, [c_ptr2]\n"
-                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
-                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
-                    );
-                    break;
-                default:
-                case 4:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "whilelt p6.s, %[temp], %[leftovers]\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.s\n"
-                        "ld1w z16.s, p0/z, [%[biasptr]]\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "mov z20.d, z16.d\n"
-                        "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
-                        "mov z24.d, z16.d\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1]\n"
-                        "mov z28.d, z16.d\n"
-                        "ld1rqw z2.s, p7/z, [a_ptr2]\n"
-                        "ld1rqw z3.s, p7/z, [a_ptr3]\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "mov z21.d, z17.d\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "mov z25.d, z17.d\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "mov z29.d, z17.d\n"
-                        "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "mov z22.d, z18.d\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "mov z26.d, z18.d\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "mov z30.d, z18.d\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z23.d, z19.d\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "mov z27.d, z19.d\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "mov z31.d, z19.d\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z20.s, z8.s, z1.s[0]\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        "fmla z24.s, z8.s, z2.s[0]\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
-                        "fmla z28.s, z8.s, z3.s[0]\n"
-                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "ld1rqw z7.s, p7/z, [a_ptr3]\n"
-                        "fmla z21.s, z9.s, z1.s[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "fmla z25.s, z9.s, z2.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z29.s, z9.s, z3.s[0]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z22.s, z10.s, z1.s[0]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "fmla z26.s, z10.s, z2.s[0]\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "fmla z30.s, z10.s, z3.s[0]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "fmla z23.s, z11.s, z1.s[0]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        "fmla z27.s, z11.s, z2.s[0]\n"
-                        "fmla z31.s, z11.s, z3.s[0]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z12.s, z1.s[1]\n"
-                        "fmla z24.s, z12.s, z2.s[1]\n"
-                        "fmla z28.s, z12.s, z3.s[1]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "fmla z21.s, z13.s, z1.s[1]\n"
-                        "fmla z25.s, z13.s, z2.s[1]\n"
-                        "fmla z29.s, z13.s, z3.s[1]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "fmla z22.s, z14.s, z1.s[1]\n"
-                        "fmla z26.s, z14.s, z2.s[1]\n"
-                        "fmla z30.s, z14.s, z3.s[1]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "fmla z23.s, z15.s, z1.s[1]\n"
-                        "fmla z27.s, z15.s, z2.s[1]\n"
-                        "fmla z31.s, z15.s, z3.s[1]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z8.s, z1.s[2]\n"
-                        "fmla z24.s, z8.s, z2.s[2]\n"
-                        "fmla z28.s, z8.s, z3.s[2]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "fmla z21.s, z9.s, z1.s[2]\n"
-                        "fmla z25.s, z9.s, z2.s[2]\n"
-                        "fmla z29.s, z9.s, z3.s[2]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "fmla z22.s, z10.s, z1.s[2]\n"
-                        "fmla z26.s, z10.s, z2.s[2]\n"
-                        "fmla z30.s, z10.s, z3.s[2]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "fmla z23.s, z11.s, z1.s[2]\n"
-                        "fmla z27.s, z11.s, z2.s[2]\n"
-                        "fmla z31.s, z11.s, z3.s[2]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z12.s, z1.s[3]\n"
-                        "fmla z24.s, z12.s, z2.s[3]\n"
-                        "fmla z28.s, z12.s, z3.s[3]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z13.s, z0.s[3]\n"
-                        "fmla z21.s, z13.s, z1.s[3]\n"
-                        "fmla z25.s, z13.s, z2.s[3]\n"
-                        "fmla z29.s, z13.s, z3.s[3]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[3]\n"
-                        "fmla z22.s, z14.s, z1.s[3]\n"
-                        "fmla z26.s, z14.s, z2.s[3]\n"
-                        "fmla z30.s, z14.s, z3.s[3]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[3]\n"
-                        "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "fmla z23.s, z15.s, z1.s[3]\n"
-                        "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
-                        "fmla z27.s, z15.s, z2.s[3]\n"
-                        "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
-                        "fmla z31.s, z15.s, z3.s[3]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[0]\n"
-                        "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
-                        "fmla z20.s, z8.s, z5.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z24.s, z8.s, z6.s[0]\n"
-                        "fmla z28.s, z8.s, z7.s[0]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z9.s, z4.s[0]\n"
-                        "fmla z21.s, z9.s, z5.s[0]\n"
-                        "fmla z25.s, z9.s, z6.s[0]\n"
-                        "fmla z29.s, z9.s, z7.s[0]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z4.s[0]\n"
-                        "fmla z22.s, z10.s, z5.s[0]\n"
-                        "fmla z26.s, z10.s, z6.s[0]\n"
-                        "fmla z30.s, z10.s, z7.s[0]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z4.s[0]\n"
-                        "fmla z23.s, z11.s, z5.s[0]\n"
-                        "fmla z27.s, z11.s, z6.s[0]\n"
-                        "fmla z31.s, z11.s, z7.s[0]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z12.s, z5.s[1]\n"
-                        "fmla z24.s, z12.s, z6.s[1]\n"
-                        "fmla z28.s, z12.s, z7.s[1]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z13.s, z4.s[1]\n"
-                        "fmla z21.s, z13.s, z5.s[1]\n"
-                        "fmla z25.s, z13.s, z6.s[1]\n"
-                        "fmla z29.s, z13.s, z7.s[1]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z14.s, z4.s[1]\n"
-                        "fmla z22.s, z14.s, z5.s[1]\n"
-                        "fmla z26.s, z14.s, z6.s[1]\n"
-                        "fmla z30.s, z14.s, z7.s[1]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z4.s[1]\n"
-                        "fmla z23.s, z15.s, z5.s[1]\n"
-                        "fmla z27.s, z15.s, z6.s[1]\n"
-                        "fmla z31.s, z15.s, z7.s[1]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z8.s, z5.s[2]\n"
-                        "fmla z24.s, z8.s, z6.s[2]\n"
-                        "fmla z28.s, z8.s, z7.s[2]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z9.s, z4.s[2]\n"
-                        "fmla z21.s, z9.s, z5.s[2]\n"
-                        "fmla z25.s, z9.s, z6.s[2]\n"
-                        "fmla z29.s, z9.s, z7.s[2]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z4.s[2]\n"
-                        "fmla z22.s, z10.s, z5.s[2]\n"
-                        "fmla z26.s, z10.s, z6.s[2]\n"
-                        "fmla z30.s, z10.s, z7.s[2]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z4.s[2]\n"
-                        "fmla z23.s, z11.s, z5.s[2]\n"
-                        "fmla z27.s, z11.s, z6.s[2]\n"
-                        "fmla z31.s, z11.s, z7.s[2]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z12.s, z5.s[3]\n"
-                        "fmla z24.s, z12.s, z6.s[3]\n"
-                        "fmla z28.s, z12.s, z7.s[3]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z13.s, z4.s[3]\n"
-                        "fmla z21.s, z13.s, z5.s[3]\n"
-                        "fmla z25.s, z13.s, z6.s[3]\n"
-                        "fmla z29.s, z13.s, z7.s[3]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z14.s, z4.s[3]\n"
-                        "fmla z22.s, z14.s, z5.s[3]\n"
-                        "fmla z26.s, z14.s, z6.s[3]\n"
-                        "fmla z30.s, z14.s, z7.s[3]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z4.s[3]\n"
-                        "fmla z23.s, z15.s, z5.s[3]\n"
-                        "fmla z27.s, z15.s, z6.s[3]\n"
-                        "fmla z31.s, z15.s, z7.s[3]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "cbz %[regs], 3f\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
-                        "fmla z20.s, z8.s, z1.s[0]\n"
-                        "ld1rqw z5.s, p7/z, [a_ptr1]\n"
-                        "fmla z24.s, z8.s, z2.s[0]\n"
-                        "ld1rqw z6.s, p7/z, [a_ptr2]\n"
-                        "fmla z28.s, z8.s, z3.s[0]\n"
-                        "ld1rqw z7.s, p7/z, [a_ptr3]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z21.s, z9.s, z1.s[0]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z25.s, z9.s, z2.s[0]\n"
-                        "fmla z29.s, z9.s, z3.s[0]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "fmla z22.s, z10.s, z1.s[0]\n"
-                        "fmla z26.s, z10.s, z2.s[0]\n"
-                        "fmla z30.s, z10.s, z3.s[0]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "fmla z23.s, z11.s, z1.s[0]\n"
-                        "fmla z27.s, z11.s, z2.s[0]\n"
-                        "fmla z31.s, z11.s, z3.s[0]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z12.s, z1.s[1]\n"
-                        "fmla z24.s, z12.s, z2.s[1]\n"
-                        "fmla z28.s, z12.s, z3.s[1]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "fmla z21.s, z13.s, z1.s[1]\n"
-                        "fmla z25.s, z13.s, z2.s[1]\n"
-                        "fmla z29.s, z13.s, z3.s[1]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "fmla z22.s, z14.s, z1.s[1]\n"
-                        "fmla z26.s, z14.s, z2.s[1]\n"
-                        "fmla z30.s, z14.s, z3.s[1]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "fmla z23.s, z15.s, z1.s[1]\n"
-                        "fmla z27.s, z15.s, z2.s[1]\n"
-                        "fmla z31.s, z15.s, z3.s[1]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z8.s, z1.s[2]\n"
-                        "fmla z24.s, z8.s, z2.s[2]\n"
-                        "fmla z28.s, z8.s, z3.s[2]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "fmla z21.s, z9.s, z1.s[2]\n"
-                        "fmla z25.s, z9.s, z2.s[2]\n"
-                        "fmla z29.s, z9.s, z3.s[2]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "fmla z22.s, z10.s, z1.s[2]\n"
-                        "fmla z26.s, z10.s, z2.s[2]\n"
-                        "fmla z30.s, z10.s, z3.s[2]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "fmla z23.s, z11.s, z1.s[2]\n"
-                        "fmla z27.s, z11.s, z2.s[2]\n"
-                        "fmla z31.s, z11.s, z3.s[2]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z12.s, z1.s[3]\n"
-                        "fmla z24.s, z12.s, z2.s[3]\n"
-                        "fmla z28.s, z12.s, z3.s[3]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z13.s, z0.s[3]\n"
-                        "fmla z21.s, z13.s, z1.s[3]\n"
-                        "fmla z25.s, z13.s, z2.s[3]\n"
-                        "fmla z29.s, z13.s, z3.s[3]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[3]\n"
-                        "fmla z22.s, z14.s, z1.s[3]\n"
-                        "fmla z26.s, z14.s, z2.s[3]\n"
-                        "fmla z30.s, z14.s, z3.s[3]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[3]\n"
-                        "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
-                        "fmla z23.s, z15.s, z1.s[3]\n"
-                        "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
-                        "fmla z27.s, z15.s, z2.s[3]\n"
-                        "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
-                        "fmla z31.s, z15.s, z3.s[3]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[0]\n"
-                        "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
-                        "fmla z20.s, z8.s, z5.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z24.s, z8.s, z6.s[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "fmla z28.s, z8.s, z7.s[0]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z9.s, z4.s[0]\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        "fmla z21.s, z9.s, z5.s[0]\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        "fmla z25.s, z9.s, z6.s[0]\n"
-                        "addvl a_ptr3, a_ptr3, #2\n"
-                        "fmla z29.s, z9.s, z7.s[0]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z4.s[0]\n"
-                        "fmla z22.s, z10.s, z5.s[0]\n"
-                        "fmla z26.s, z10.s, z6.s[0]\n"
-                        "fmla z30.s, z10.s, z7.s[0]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z4.s[0]\n"
-                        "fmla z23.s, z11.s, z5.s[0]\n"
-                        "fmla z27.s, z11.s, z6.s[0]\n"
-                        "fmla z31.s, z11.s, z7.s[0]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z12.s, z5.s[1]\n"
-                        "fmla z24.s, z12.s, z6.s[1]\n"
-                        "fmla z28.s, z12.s, z7.s[1]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z13.s, z4.s[1]\n"
-                        "fmla z21.s, z13.s, z5.s[1]\n"
-                        "fmla z25.s, z13.s, z6.s[1]\n"
-                        "fmla z29.s, z13.s, z7.s[1]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z14.s, z4.s[1]\n"
-                        "fmla z22.s, z14.s, z5.s[1]\n"
-                        "fmla z26.s, z14.s, z6.s[1]\n"
-                        "fmla z30.s, z14.s, z7.s[1]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z4.s[1]\n"
-                        "fmla z23.s, z15.s, z5.s[1]\n"
-                        "fmla z27.s, z15.s, z6.s[1]\n"
-                        "fmla z31.s, z15.s, z7.s[1]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[2]\n"
-                        "fmla z20.s, z8.s, z5.s[2]\n"
-                        "fmla z24.s, z8.s, z6.s[2]\n"
-                        "fmla z28.s, z8.s, z7.s[2]\n"
-                        "fmla z17.s, z9.s, z4.s[2]\n"
-                        "fmla z21.s, z9.s, z5.s[2]\n"
-                        "fmla z25.s, z9.s, z6.s[2]\n"
-                        "fmla z29.s, z9.s, z7.s[2]\n"
-                        "fmla z18.s, z10.s, z4.s[2]\n"
-                        "fmla z22.s, z10.s, z5.s[2]\n"
-                        "fmla z26.s, z10.s, z6.s[2]\n"
-                        "fmla z30.s, z10.s, z7.s[2]\n"
-                        "fmla z19.s, z11.s, z4.s[2]\n"
-                        "fmla z23.s, z11.s, z5.s[2]\n"
-                        "fmla z27.s, z11.s, z6.s[2]\n"
-                        "fmla z31.s, z11.s, z7.s[2]\n"
-                        "fmla z16.s, z12.s, z4.s[3]\n"
-                        "fmla z20.s, z12.s, z5.s[3]\n"
-                        "fmla z24.s, z12.s, z6.s[3]\n"
-                        "fmla z28.s, z12.s, z7.s[3]\n"
-                        "fmla z17.s, z13.s, z4.s[3]\n"
-                        "fmla z21.s, z13.s, z5.s[3]\n"
-                        "fmla z25.s, z13.s, z6.s[3]\n"
-                        "fmla z29.s, z13.s, z7.s[3]\n"
-                        "fmla z18.s, z14.s, z4.s[3]\n"
-                        "fmla z22.s, z14.s, z5.s[3]\n"
-                        "fmla z26.s, z14.s, z6.s[3]\n"
-                        "fmla z30.s, z14.s, z7.s[3]\n"
-                        "fmla z19.s, z15.s, z4.s[3]\n"
-                        "fmla z23.s, z15.s, z5.s[3]\n"
-                        "fmla z27.s, z15.s, z6.s[3]\n"
-                        "fmla z31.s, z15.s, z7.s[3]\n"
-                        "cbz %[blocks], 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "fmla z20.s, z8.s, z1.s[0]\n"
-                        "fmla z24.s, z8.s, z2.s[0]\n"
-                        "fmla z28.s, z8.s, z3.s[0]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "fmla z21.s, z9.s, z1.s[0]\n"
-                        "fmla z25.s, z9.s, z2.s[0]\n"
-                        "fmla z29.s, z9.s, z3.s[0]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "fmla z22.s, z10.s, z1.s[0]\n"
-                        "fmla z26.s, z10.s, z2.s[0]\n"
-                        "fmla z30.s, z10.s, z3.s[0]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "fmla z23.s, z11.s, z1.s[0]\n"
-                        "fmla z27.s, z11.s, z2.s[0]\n"
-                        "fmla z31.s, z11.s, z3.s[0]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "fmla z20.s, z12.s, z1.s[1]\n"
-                        "fmla z24.s, z12.s, z2.s[1]\n"
-                        "fmla z28.s, z12.s, z3.s[1]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "fmla z21.s, z13.s, z1.s[1]\n"
-                        "fmla z25.s, z13.s, z2.s[1]\n"
-                        "fmla z29.s, z13.s, z3.s[1]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "fmla z22.s, z14.s, z1.s[1]\n"
-                        "fmla z26.s, z14.s, z2.s[1]\n"
-                        "fmla z30.s, z14.s, z3.s[1]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "fmla z23.s, z15.s, z1.s[1]\n"
-                        "fmla z27.s, z15.s, z2.s[1]\n"
-                        "fmla z31.s, z15.s, z3.s[1]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "fmla z20.s, z8.s, z1.s[2]\n"
-                        "fmla z24.s, z8.s, z2.s[2]\n"
-                        "fmla z28.s, z8.s, z3.s[2]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "fmla z21.s, z9.s, z1.s[2]\n"
-                        "fmla z25.s, z9.s, z2.s[2]\n"
-                        "fmla z29.s, z9.s, z3.s[2]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "fmla z22.s, z10.s, z1.s[2]\n"
-                        "fmla z26.s, z10.s, z2.s[2]\n"
-                        "fmla z30.s, z10.s, z3.s[2]\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "fmla z23.s, z11.s, z1.s[2]\n"
-                        "fmla z27.s, z11.s, z2.s[2]\n"
-                        "fmla z31.s, z11.s, z3.s[2]\n"
-                        "b 4f\n"
-                        "3:\n"
-                        "fmla z16.s, z8.s, z0.s[0]\n"
-                        "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
-                        "fmla z20.s, z8.s, z1.s[0]\n"
-                        "ld1rqw z5.s, p6/z, [a_ptr1]\n"
-                        "fmla z24.s, z8.s, z2.s[0]\n"
-                        "ld1rqw z6.s, p6/z, [a_ptr2]\n"
-                        "fmla z28.s, z8.s, z3.s[0]\n"
-                        "ld1rqw z7.s, p6/z, [a_ptr3]\n"
-                        "fmla z17.s, z9.s, z0.s[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z21.s, z9.s, z1.s[0]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z25.s, z9.s, z2.s[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "fmla z29.s, z9.s, z3.s[0]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z10.s, z0.s[0]\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        "fmla z22.s, z10.s, z1.s[0]\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        "fmla z26.s, z10.s, z2.s[0]\n"
-                        "addvl a_ptr3, a_ptr3, #1\n"
-                        "fmla z30.s, z10.s, z3.s[0]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z11.s, z0.s[0]\n"
-                        "fmla z23.s, z11.s, z1.s[0]\n"
-                        "fmla z27.s, z11.s, z2.s[0]\n"
-                        "fmla z31.s, z11.s, z3.s[0]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z0.s[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "fmla z20.s, z12.s, z1.s[1]\n"
-                        "fmla z24.s, z12.s, z2.s[1]\n"
-                        "fmla z28.s, z12.s, z3.s[1]\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "fmla z17.s, z13.s, z0.s[1]\n"
-                        "fmla z21.s, z13.s, z1.s[1]\n"
-                        "fmla z25.s, z13.s, z2.s[1]\n"
-                        "fmla z29.s, z13.s, z3.s[1]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "fmla z18.s, z14.s, z0.s[1]\n"
-                        "fmla z22.s, z14.s, z1.s[1]\n"
-                        "fmla z26.s, z14.s, z2.s[1]\n"
-                        "fmla z30.s, z14.s, z3.s[1]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "fmla z19.s, z15.s, z0.s[1]\n"
-                        "fmla z23.s, z15.s, z1.s[1]\n"
-                        "fmla z27.s, z15.s, z2.s[1]\n"
-                        "fmla z31.s, z15.s, z3.s[1]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z0.s[2]\n"
-                        "fmla z20.s, z8.s, z1.s[2]\n"
-                        "fmla z24.s, z8.s, z2.s[2]\n"
-                        "fmla z28.s, z8.s, z3.s[2]\n"
-                        "fmla z17.s, z9.s, z0.s[2]\n"
-                        "fmla z21.s, z9.s, z1.s[2]\n"
-                        "fmla z25.s, z9.s, z2.s[2]\n"
-                        "fmla z29.s, z9.s, z3.s[2]\n"
-                        "fmla z18.s, z10.s, z0.s[2]\n"
-                        "fmla z22.s, z10.s, z1.s[2]\n"
-                        "fmla z26.s, z10.s, z2.s[2]\n"
-                        "fmla z30.s, z10.s, z3.s[2]\n"
-                        "fmla z19.s, z11.s, z0.s[2]\n"
-                        "fmla z23.s, z11.s, z1.s[2]\n"
-                        "fmla z27.s, z11.s, z2.s[2]\n"
-                        "fmla z31.s, z11.s, z3.s[2]\n"
-                        "fmla z16.s, z12.s, z0.s[3]\n"
-                        "fmla z20.s, z12.s, z1.s[3]\n"
-                        "fmla z24.s, z12.s, z2.s[3]\n"
-                        "fmla z28.s, z12.s, z3.s[3]\n"
-                        "fmla z17.s, z13.s, z0.s[3]\n"
-                        "fmla z21.s, z13.s, z1.s[3]\n"
-                        "fmla z25.s, z13.s, z2.s[3]\n"
-                        "fmla z29.s, z13.s, z3.s[3]\n"
-                        "fmla z18.s, z14.s, z0.s[3]\n"
-                        "fmla z22.s, z14.s, z1.s[3]\n"
-                        "fmla z26.s, z14.s, z2.s[3]\n"
-                        "fmla z30.s, z14.s, z3.s[3]\n"
-                        "fmla z19.s, z15.s, z0.s[3]\n"
-                        "fmla z23.s, z15.s, z1.s[3]\n"
-                        "fmla z27.s, z15.s, z2.s[3]\n"
-                        "fmla z31.s, z15.s, z3.s[3]\n"
-                        "cbz %[blocks], 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[0]\n"
-                        "fmla z20.s, z8.s, z5.s[0]\n"
-                        "fmla z24.s, z8.s, z6.s[0]\n"
-                        "fmla z28.s, z8.s, z7.s[0]\n"
-                        "fmla z17.s, z9.s, z4.s[0]\n"
-                        "fmla z21.s, z9.s, z5.s[0]\n"
-                        "fmla z25.s, z9.s, z6.s[0]\n"
-                        "fmla z29.s, z9.s, z7.s[0]\n"
-                        "fmla z18.s, z10.s, z4.s[0]\n"
-                        "fmla z22.s, z10.s, z5.s[0]\n"
-                        "fmla z26.s, z10.s, z6.s[0]\n"
-                        "fmla z30.s, z10.s, z7.s[0]\n"
-                        "fmla z19.s, z11.s, z4.s[0]\n"
-                        "fmla z23.s, z11.s, z5.s[0]\n"
-                        "fmla z27.s, z11.s, z6.s[0]\n"
-                        "fmla z31.s, z11.s, z7.s[0]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "ld1w z12.s, p0/z, [%[b_ptr0]]\n"
-                        "ld1w z13.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z14.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z15.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z12.s, z4.s[1]\n"
-                        "fmla z20.s, z12.s, z5.s[1]\n"
-                        "fmla z24.s, z12.s, z6.s[1]\n"
-                        "fmla z28.s, z12.s, z7.s[1]\n"
-                        "fmla z17.s, z13.s, z4.s[1]\n"
-                        "fmla z21.s, z13.s, z5.s[1]\n"
-                        "fmla z25.s, z13.s, z6.s[1]\n"
-                        "fmla z29.s, z13.s, z7.s[1]\n"
-                        "fmla z18.s, z14.s, z4.s[1]\n"
-                        "fmla z22.s, z14.s, z5.s[1]\n"
-                        "fmla z26.s, z14.s, z6.s[1]\n"
-                        "fmla z30.s, z14.s, z7.s[1]\n"
-                        "fmla z19.s, z15.s, z4.s[1]\n"
-                        "fmla z23.s, z15.s, z5.s[1]\n"
-                        "fmla z27.s, z15.s, z6.s[1]\n"
-                        "fmla z31.s, z15.s, z7.s[1]\n"
-                        "b.eq 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "ld1w z8.s, p0/z, [%[b_ptr0]]\n"
-                        "ld1w z9.s, p1/z, [%[b_ptr0], #1, MUL VL]\n"
-                        "ld1w z10.s, p2/z, [%[b_ptr0], #2, MUL VL]\n"
-                        "ld1w z11.s, p3/z, [%[b_ptr0], #3, MUL VL]\n"
-                        "fmla z16.s, z8.s, z4.s[2]\n"
-                        "fmla z20.s, z8.s, z5.s[2]\n"
-                        "fmla z24.s, z8.s, z6.s[2]\n"
-                        "fmla z28.s, z8.s, z7.s[2]\n"
-                        "fmla z17.s, z9.s, z4.s[2]\n"
-                        "fmla z21.s, z9.s, z5.s[2]\n"
-                        "fmla z25.s, z9.s, z6.s[2]\n"
-                        "fmla z29.s, z9.s, z7.s[2]\n"
-                        "fmla z18.s, z10.s, z4.s[2]\n"
-                        "fmla z22.s, z10.s, z5.s[2]\n"
-                        "fmla z26.s, z10.s, z6.s[2]\n"
-                        "fmla z30.s, z10.s, z7.s[2]\n"
-                        "fmla z19.s, z11.s, z4.s[2]\n"
-                        "fmla z23.s, z11.s, z5.s[2]\n"
-                        "fmla z27.s, z11.s, z6.s[2]\n"
-                        "fmla z31.s, z11.s, z7.s[2]\n"
-                        "4:\n"
-                        "ld1rw z14.s, p7/z, [%[minptr]]\n"
-                        "ld1rw z15.s, p7/z, [%[maxptr]]\n"
-                        "fmax z16.s, p7/m, z16.s, z14.s\n"
-                        "fmax z17.s, p7/m, z17.s, z14.s\n"
-                        "fmax z18.s, p7/m, z18.s, z14.s\n"
-                        "fmax z19.s, p7/m, z19.s, z14.s\n"
-                        "fmin z16.s, p7/m, z16.s, z15.s\n"
-                        "fmin z17.s, p7/m, z17.s, z15.s\n"
-                        "fmin z18.s, p7/m, z18.s, z15.s\n"
-                        "fmin z19.s, p7/m, z19.s, z15.s\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "fmax z20.s, p7/m, z20.s, z14.s\n"
-                        "fmax z21.s, p7/m, z21.s, z14.s\n"
-                        "fmax z22.s, p7/m, z22.s, z14.s\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "fmax z23.s, p7/m, z23.s, z14.s\n"
-                        "fmin z20.s, p7/m, z20.s, z15.s\n"
-                        "fmin z21.s, p7/m, z21.s, z15.s\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "fmin z22.s, p7/m, z22.s, z15.s\n"
-                        "fmin z23.s, p7/m, z23.s, z15.s\n"
-                        "fmax z24.s, p7/m, z24.s, z14.s\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "fmax z25.s, p7/m, z25.s, z14.s\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "fmax z26.s, p7/m, z26.s, z14.s\n"
-                        "st1w z20.s, p0, [c_ptr1]\n"
-                        "fmin z24.s, p7/m, z24.s, z15.s\n"
-                        "fmin z25.s, p7/m, z25.s, z15.s\n"
-                        "fmax z27.s, p7/m, z27.s, z14.s\n"
-                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "fmin z26.s, p7/m, z26.s, z15.s\n"
-                        "fmax z28.s, p7/m, z28.s, z14.s\n"
-                        "fmax z29.s, p7/m, z29.s, z14.s\n"
-                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        "fmin z27.s, p7/m, z27.s, z15.s\n"
-                        "fmax z30.s, p7/m, z30.s, z14.s\n"
-                        "fmin z28.s, p7/m, z28.s, z15.s\n"
-                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
-                        "fmin z29.s, p7/m, z29.s, z15.s\n"
-                        "fmax z31.s, p7/m, z31.s, z14.s\n"
-                        "fmin z30.s, p7/m, z30.s, z15.s\n"
-                        "st1w z24.s, p0, [c_ptr2]\n"
-                        "fmin z31.s, p7/m, z31.s, z15.s\n"
-                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
-                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
-                        "st1w z28.s, p0, [c_ptr3]\n"
-                        "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
-                        "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
-                        "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
-                    );
-                    break;
-            }
-
-        }
-    }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp
deleted file mode 100644
index 1b9d131..0000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
-#include <cstdint>
-
-
-namespace arm_gemm
-{
-
-// Actual kernel implementations
-void sve_native_s8s32_dot_4VLx4(const int8_t *, int, const int8_t *, int ldb, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
-
-class native_s8s32_dot_4VLx4
-{
-public:
-    typedef int8_t operand_type;
-    typedef int32_t result_type;
-
-    typedef void (*kern_type)(const int8_t *, int, const int8_t *, int ldb, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
-
-    /* Kernel blocking parameters */
-    static constexpr unsigned int out_height()
-    {
-        return 4;
-    }
-
-    static unsigned int out_width()
-    {
-        return get_vector_length<int32_t>() * 4;
-    }
-
-    static constexpr unsigned int k_unroll()
-    {
-        return 4;
-    }
-
-    static constexpr bool supports_append()
-    {
-        return false;
-    }
-
-    static constexpr bool supports_bias()
-    {
-        return false;
-    }
-
-    static constexpr bool supports_activation()
-    {
-        return false;
-    }
-
-
-
-    // Default to the generic kernel
-    kern_type kernel=sve_native_s8s32_dot_4VLx4;
-
-    native_s8s32_dot_4VLx4(const CPUInfo *ci) { UNUSED(ci); }
-};
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp
deleted file mode 100644
index 95cf88a..0000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp
+++ /dev/null
@@ -1,4479 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include <cstdint>
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_native_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int ldb, int32_t *C, int ldc, int M, int N, int K, const int32_t *bias, Activation act, bool append) {
-    const long loops_count = ((K + 16) / 32) - 1;
-    K -= loops_count * 32;
-    const long regs_count = (K / 16) - 1;
-    K -= (regs_count + 1) * 16;
-    const long leftovers = K;
-    const long blocks_count = K / 4;
-    const long odds_count = K - (blocks_count * 4);
-
-    for (int y=0; y<M; y+=4) {
-        const int8_t * const a_ptr0_base = A + (y * lda);
-        const unsigned long ldab = lda * sizeof(int8_t);
-
-        int32_t *c_ptr0 = C + (y * ldc);
-
-        for (int x0=0; x0<N; x0+=(4 * get_vector_length<int32_t>())) {
-            const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<int32_t>()));
-            long loops = loops_count;
-            long regs = regs_count;
-            long temp = 0;
-            long blocks = blocks_count;
-            long odds = odds_count;
-            const int8_t *a_ptr0 = a_ptr0_base;
-            const int8_t *b_ptr0 = B + x0;
-            const int8_t *b_ptr1 = b_ptr0 + ldb;
-            const int8_t *b_ptr2 = b_ptr1 + ldb;
-            const int8_t *b_ptr3 = b_ptr2 + ldb;
-            long ldbb = ldb * sizeof(int8_t) * 4;
-            const unsigned long ldcb = ldc * sizeof(int32_t);
-
-            switch(M-y) {
-                case 1:
-                    __asm __volatile (
-                        "mov z16.s, #0\n"
-                        "whilelt p6.b, %[temp], %[leftovers]\n"
-                        "mov z17.s, #0\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "mov z18.s, #0\n"
-                        "whilelt p4.b, %[temp], %[width]\n"
-                        "mov z19.s, #0\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z16.s, z12.b, z4.b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z17.s, z13.b, z4.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z18.s, z14.b, z4.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z4.b[3]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "cbz %[regs], 3f\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z16.s, z12.b, z4.b[3]\n"
-                        "sdot z17.s, z13.b, z4.b[3]\n"
-                        "sdot z18.s, z14.b, z4.b[3]\n"
-                        "sdot z19.s, z15.b, z4.b[3]\n"
-                        "cbz %[blocks], 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "b.eq 5f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "b.eq 6f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 8f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 10f\n"
-                        "9:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 10f\n"
-                        "8:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "10:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "b 7f\n"
-                        "6:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 11f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 12f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 13f\n"
-                        "12:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 13f\n"
-                        "11:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "13:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "b 7f\n"
-                        "5:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 14f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 15f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 16f\n"
-                        "15:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 16f\n"
-                        "14:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "16:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "b 7f\n"
-                        "4:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 17f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 18f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 19f\n"
-                        "18:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 19f\n"
-                        "17:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "19:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "b 7f\n"
-                        "3:\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "cbz %[blocks], 20f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "b.eq 21f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "b.eq 22f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 23f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 24f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 25f\n"
-                        "24:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 25f\n"
-                        "23:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "25:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z16.s, z12.b, z4.b[3]\n"
-                        "sdot z17.s, z13.b, z4.b[3]\n"
-                        "sdot z18.s, z14.b, z4.b[3]\n"
-                        "sdot z19.s, z15.b, z4.b[3]\n"
-                        "b 7f\n"
-                        "22:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 26f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 27f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 28f\n"
-                        "27:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 28f\n"
-                        "26:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "28:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "b 7f\n"
-                        "21:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 29f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 30f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 31f\n"
-                        "30:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 31f\n"
-                        "29:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "31:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "b 7f\n"
-                        "20:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 32f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 33f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 34f\n"
-                        "33:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 34f\n"
-                        "32:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "34:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "7:\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
-                    );
-                    break;
-                case 2:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "mov z16.s, #0\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "mov z17.s, #0\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "mov z18.s, #0\n"
-                        "whilelt p6.b, %[temp], %[leftovers]\n"
-                        "mov z19.s, #0\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "mov z20.s, #0\n"
-                        "whilelt p4.b, %[temp], %[width]\n"
-                        "mov z21.s, #0\n"
-                        "incw %[temp], all, mul #1\n"
-                        "mov z22.s, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "mov z23.s, #0\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "ptrue p7.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "sdot z20.s, z8.b, z1.b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z23.s, z11.b, z1.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "sdot z21.s, z13.b, z1.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "sdot z21.s, z9.b, z1.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z12.b, z1.b[3]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "sdot z21.s, z13.b, z1.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z14.b, z1.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "sdot z23.s, z15.b, z1.b[3]\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z20.s, z8.b, z5.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
-                        "sdot z21.s, z9.b, z5.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z10.b, z5.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z23.s, z11.b, z5.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z12.b, z5.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
-                        "sdot z21.s, z13.b, z5.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z14.b, z5.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z23.s, z15.b, z5.b[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z8.b, z5.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
-                        "sdot z21.s, z9.b, z5.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z10.b, z5.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z23.s, z11.b, z5.b[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z12.b, z4.b[3]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z12.b, z5.b[3]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "sdot z17.s, z13.b, z4.b[3]\n"
-                        "sdot z21.s, z13.b, z5.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z4.b[3]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z14.b, z5.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z4.b[3]\n"
-                        "sdot z23.s, z15.b, z5.b[3]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "cbz %[regs], 3f\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "sdot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z23.s, z11.b, z1.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "sdot z21.s, z13.b, z1.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "sdot z21.s, z9.b, z1.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z12.b, z1.b[3]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "sdot z21.s, z13.b, z1.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z14.b, z1.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
-                        "sdot z23.s, z15.b, z1.b[3]\n"
-                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z20.s, z8.b, z5.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
-                        "sdot z21.s, z9.b, z5.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z10.b, z5.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z23.s, z11.b, z5.b[0]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z12.b, z5.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
-                        "sdot z21.s, z13.b, z5.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z14.b, z5.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "sdot z23.s, z15.b, z5.b[1]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "sdot z20.s, z8.b, z5.b[2]\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "sdot z21.s, z9.b, z5.b[2]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "sdot z22.s, z10.b, z5.b[2]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "sdot z23.s, z11.b, z5.b[2]\n"
-                        "sdot z16.s, z12.b, z4.b[3]\n"
-                        "sdot z20.s, z12.b, z5.b[3]\n"
-                        "sdot z17.s, z13.b, z4.b[3]\n"
-                        "sdot z21.s, z13.b, z5.b[3]\n"
-                        "sdot z18.s, z14.b, z4.b[3]\n"
-                        "sdot z22.s, z14.b, z5.b[3]\n"
-                        "sdot z19.s, z15.b, z4.b[3]\n"
-                        "sdot z23.s, z15.b, z5.b[3]\n"
-                        "cbz %[blocks], 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "sdot z20.s, z8.b, z1.b[0]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "sdot z23.s, z11.b, z1.b[0]\n"
-                        "b.eq 5f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "sdot z21.s, z13.b, z1.b[1]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
-                        "b.eq 6f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "sdot z21.s, z9.b, z1.b[2]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 8f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 10f\n"
-                        "9:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 10f\n"
-                        "8:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "10:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "sdot z20.s, z12.b, z1.b[3]\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "sdot z21.s, z13.b, z1.b[3]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "sdot z22.s, z14.b, z1.b[3]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "sdot z23.s, z15.b, z1.b[3]\n"
-                        "b 7f\n"
-                        "6:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 11f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 12f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 13f\n"
-                        "12:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 13f\n"
-                        "11:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "13:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "sdot z21.s, z9.b, z1.b[2]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
-                        "b 7f\n"
-                        "5:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 14f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 15f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 16f\n"
-                        "15:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 16f\n"
-                        "14:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "16:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "sdot z21.s, z13.b, z1.b[1]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
-                        "b 7f\n"
-                        "4:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 17f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 18f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 19f\n"
-                        "18:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 19f\n"
-                        "17:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "19:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "sdot z20.s, z8.b, z1.b[0]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "sdot z23.s, z11.b, z1.b[0]\n"
-                        "b 7f\n"
-                        "3:\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
-                        "sdot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z23.s, z11.b, z1.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "sdot z21.s, z13.b, z1.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "sdot z21.s, z9.b, z1.b[2]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "sdot z20.s, z12.b, z1.b[3]\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "sdot z21.s, z13.b, z1.b[3]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "sdot z22.s, z14.b, z1.b[3]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "sdot z23.s, z15.b, z1.b[3]\n"
-                        "cbz %[blocks], 20f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "sdot z20.s, z8.b, z5.b[0]\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
-                        "sdot z21.s, z9.b, z5.b[0]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "sdot z22.s, z10.b, z5.b[0]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "sdot z23.s, z11.b, z5.b[0]\n"
-                        "b.eq 21f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "sdot z20.s, z12.b, z5.b[1]\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
-                        "sdot z21.s, z13.b, z5.b[1]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "sdot z22.s, z14.b, z5.b[1]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "sdot z23.s, z15.b, z5.b[1]\n"
-                        "b.eq 22f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "sdot z20.s, z8.b, z5.b[2]\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
-                        "sdot z21.s, z9.b, z5.b[2]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "sdot z22.s, z10.b, z5.b[2]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "sdot z23.s, z11.b, z5.b[2]\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 23f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 24f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 25f\n"
-                        "24:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 25f\n"
-                        "23:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "25:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z16.s, z12.b, z4.b[3]\n"
-                        "sdot z20.s, z12.b, z5.b[3]\n"
-                        "sdot z17.s, z13.b, z4.b[3]\n"
-                        "sdot z21.s, z13.b, z5.b[3]\n"
-                        "sdot z18.s, z14.b, z4.b[3]\n"
-                        "sdot z22.s, z14.b, z5.b[3]\n"
-                        "sdot z19.s, z15.b, z4.b[3]\n"
-                        "sdot z23.s, z15.b, z5.b[3]\n"
-                        "b 7f\n"
-                        "22:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 26f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 27f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 28f\n"
-                        "27:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 28f\n"
-                        "26:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "28:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "sdot z20.s, z8.b, z5.b[2]\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
-                        "sdot z21.s, z9.b, z5.b[2]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "sdot z22.s, z10.b, z5.b[2]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "sdot z23.s, z11.b, z5.b[2]\n"
-                        "b 7f\n"
-                        "21:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 29f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 30f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 31f\n"
-                        "30:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 31f\n"
-                        "29:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "31:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "sdot z20.s, z12.b, z5.b[1]\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
-                        "sdot z21.s, z13.b, z5.b[1]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "sdot z22.s, z14.b, z5.b[1]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "sdot z23.s, z15.b, z5.b[1]\n"
-                        "b 7f\n"
-                        "20:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 32f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 33f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 34f\n"
-                        "33:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 34f\n"
-                        "32:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "34:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "sdot z20.s, z8.b, z5.b[0]\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
-                        "sdot z21.s, z9.b, z5.b[0]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "sdot z22.s, z10.b, z5.b[0]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "sdot z23.s, z11.b, z5.b[0]\n"
-                        "7:\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "st1w z20.s, p0, [c_ptr1]\n"
-                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
-                    );
-                    break;
-                case 3:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "mov z16.s, #0\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "mov z17.s, #0\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "mov z18.s, #0\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "mov z19.s, #0\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "mov z20.s, #0\n"
-                        "whilelt p6.b, %[temp], %[leftovers]\n"
-                        "mov z21.s, #0\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "mov z22.s, #0\n"
-                        "whilelt p4.b, %[temp], %[width]\n"
-                        "mov z23.s, #0\n"
-                        "incw %[temp], all, mul #1\n"
-                        "mov z24.s, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "mov z25.s, #0\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "mov z26.s, #0\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "mov z27.s, #0\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "incw %[temp], all, mul #1\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "ptrue p7.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "sdot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "sdot z24.s, z8.b, z2.b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "sdot z26.s, z10.b, z2.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z23.s, z11.b, z1.b[0]\n"
-                        "sdot z27.s, z11.b, z2.b[0]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
-                        "sdot z24.s, z12.b, z2.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "sdot z21.s, z13.b, z1.b[1]\n"
-                        "sdot z25.s, z13.b, z2.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "sdot z26.s, z14.b, z2.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
-                        "sdot z27.s, z15.b, z2.b[1]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
-                        "sdot z24.s, z8.b, z2.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "sdot z21.s, z9.b, z1.b[2]\n"
-                        "sdot z25.s, z9.b, z2.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "sdot z26.s, z10.b, z2.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
-                        "sdot z27.s, z11.b, z2.b[2]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z12.b, z1.b[3]\n"
-                        "sdot z24.s, z12.b, z2.b[3]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "sdot z21.s, z13.b, z1.b[3]\n"
-                        "sdot z25.s, z13.b, z2.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z14.b, z1.b[3]\n"
-                        "sdot z26.s, z14.b, z2.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "sdot z23.s, z15.b, z1.b[3]\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
-                        "sdot z27.s, z15.b, z2.b[3]\n"
-                        "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z8.b, z5.b[0]\n"
-                        "sdot z24.s, z8.b, z6.b[0]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
-                        "sdot z21.s, z9.b, z5.b[0]\n"
-                        "sdot z25.s, z9.b, z6.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z10.b, z5.b[0]\n"
-                        "sdot z26.s, z10.b, z6.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z23.s, z11.b, z5.b[0]\n"
-                        "sdot z27.s, z11.b, z6.b[0]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z12.b, z5.b[1]\n"
-                        "sdot z24.s, z12.b, z6.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
-                        "sdot z21.s, z13.b, z5.b[1]\n"
-                        "sdot z25.s, z13.b, z6.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z14.b, z5.b[1]\n"
-                        "sdot z26.s, z14.b, z6.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z23.s, z15.b, z5.b[1]\n"
-                        "sdot z27.s, z15.b, z6.b[1]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z8.b, z5.b[2]\n"
-                        "sdot z24.s, z8.b, z6.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
-                        "sdot z21.s, z9.b, z5.b[2]\n"
-                        "sdot z25.s, z9.b, z6.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z10.b, z5.b[2]\n"
-                        "sdot z26.s, z10.b, z6.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z23.s, z11.b, z5.b[2]\n"
-                        "sdot z27.s, z11.b, z6.b[2]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z12.b, z4.b[3]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z12.b, z5.b[3]\n"
-                        "sdot z24.s, z12.b, z6.b[3]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "sdot z17.s, z13.b, z4.b[3]\n"
-                        "sdot z21.s, z13.b, z5.b[3]\n"
-                        "sdot z25.s, z13.b, z6.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z18.s, z14.b, z4.b[3]\n"
-                        "sdot z22.s, z14.b, z5.b[3]\n"
-                        "sdot z26.s, z14.b, z6.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z4.b[3]\n"
-                        "sdot z23.s, z15.b, z5.b[3]\n"
-                        "sdot z27.s, z15.b, z6.b[3]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "cbz %[regs], 3f\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "sdot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "sdot z24.s, z8.b, z2.b[0]\n"
-                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
-                        "sdot z26.s, z10.b, z2.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z23.s, z11.b, z1.b[0]\n"
-                        "sdot z27.s, z11.b, z2.b[0]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
-                        "sdot z24.s, z12.b, z2.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "sdot z21.s, z13.b, z1.b[1]\n"
-                        "sdot z25.s, z13.b, z2.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "sdot z26.s, z14.b, z2.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
-                        "sdot z27.s, z15.b, z2.b[1]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
-                        "sdot z24.s, z8.b, z2.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "sdot z21.s, z9.b, z1.b[2]\n"
-                        "sdot z25.s, z9.b, z2.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "sdot z26.s, z10.b, z2.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
-                        "sdot z27.s, z11.b, z2.b[2]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z12.b, z1.b[3]\n"
-                        "sdot z24.s, z12.b, z2.b[3]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "sdot z21.s, z13.b, z1.b[3]\n"
-                        "sdot z25.s, z13.b, z2.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z14.b, z1.b[3]\n"
-                        "sdot z26.s, z14.b, z2.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
-                        "sdot z23.s, z15.b, z1.b[3]\n"
-                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
-                        "sdot z27.s, z15.b, z2.b[3]\n"
-                        "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z8.b, z5.b[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "sdot z24.s, z8.b, z6.b[0]\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
-                        "sdot z21.s, z9.b, z5.b[0]\n"
-                        "sdot z25.s, z9.b, z6.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z10.b, z5.b[0]\n"
-                        "sdot z26.s, z10.b, z6.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z23.s, z11.b, z5.b[0]\n"
-                        "sdot z27.s, z11.b, z6.b[0]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z12.b, z5.b[1]\n"
-                        "sdot z24.s, z12.b, z6.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
-                        "sdot z21.s, z13.b, z5.b[1]\n"
-                        "sdot z25.s, z13.b, z6.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "sdot z22.s, z14.b, z5.b[1]\n"
-                        "sdot z26.s, z14.b, z6.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "sdot z23.s, z15.b, z5.b[1]\n"
-                        "sdot z27.s, z15.b, z6.b[1]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "sdot z20.s, z8.b, z5.b[2]\n"
-                        "sdot z24.s, z8.b, z6.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
-                        "sdot z21.s, z9.b, z5.b[2]\n"
-                        "sdot z25.s, z9.b, z6.b[2]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "sdot z22.s, z10.b, z5.b[2]\n"
-                        "sdot z26.s, z10.b, z6.b[2]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "sdot z23.s, z11.b, z5.b[2]\n"
-                        "sdot z27.s, z11.b, z6.b[2]\n"
-                        "sdot z16.s, z12.b, z4.b[3]\n"
-                        "sdot z20.s, z12.b, z5.b[3]\n"
-                        "sdot z24.s, z12.b, z6.b[3]\n"
-                        "sdot z17.s, z13.b, z4.b[3]\n"
-                        "sdot z21.s, z13.b, z5.b[3]\n"
-                        "sdot z25.s, z13.b, z6.b[3]\n"
-                        "sdot z18.s, z14.b, z4.b[3]\n"
-                        "sdot z22.s, z14.b, z5.b[3]\n"
-                        "sdot z26.s, z14.b, z6.b[3]\n"
-                        "sdot z19.s, z15.b, z4.b[3]\n"
-                        "sdot z23.s, z15.b, z5.b[3]\n"
-                        "sdot z27.s, z15.b, z6.b[3]\n"
-                        "cbz %[blocks], 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "sdot z20.s, z8.b, z1.b[0]\n"
-                        "sdot z24.s, z8.b, z2.b[0]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
-                        "sdot z26.s, z10.b, z2.b[0]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "sdot z23.s, z11.b, z1.b[0]\n"
-                        "sdot z27.s, z11.b, z2.b[0]\n"
-                        "b.eq 5f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
-                        "sdot z24.s, z12.b, z2.b[1]\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "sdot z21.s, z13.b, z1.b[1]\n"
-                        "sdot z25.s, z13.b, z2.b[1]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "sdot z26.s, z14.b, z2.b[1]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
-                        "sdot z27.s, z15.b, z2.b[1]\n"
-                        "b.eq 6f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
-                        "sdot z24.s, z8.b, z2.b[2]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "sdot z21.s, z9.b, z1.b[2]\n"
-                        "sdot z25.s, z9.b, z2.b[2]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "sdot z26.s, z10.b, z2.b[2]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
-                        "sdot z27.s, z11.b, z2.b[2]\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 8f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 10f\n"
-                        "9:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 10f\n"
-                        "8:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "10:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "sdot z20.s, z12.b, z1.b[3]\n"
-                        "sdot z24.s, z12.b, z2.b[3]\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "sdot z21.s, z13.b, z1.b[3]\n"
-                        "sdot z25.s, z13.b, z2.b[3]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "sdot z22.s, z14.b, z1.b[3]\n"
-                        "sdot z26.s, z14.b, z2.b[3]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "sdot z23.s, z15.b, z1.b[3]\n"
-                        "sdot z27.s, z15.b, z2.b[3]\n"
-                        "b 7f\n"
-                        "6:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 11f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 12f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 13f\n"
-                        "12:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 13f\n"
-                        "11:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "13:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
-                        "sdot z24.s, z8.b, z2.b[2]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "sdot z21.s, z9.b, z1.b[2]\n"
-                        "sdot z25.s, z9.b, z2.b[2]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "sdot z26.s, z10.b, z2.b[2]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
-                        "sdot z27.s, z11.b, z2.b[2]\n"
-                        "b 7f\n"
-                        "5:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 14f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 15f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 16f\n"
-                        "15:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 16f\n"
-                        "14:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "16:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
-                        "sdot z24.s, z12.b, z2.b[1]\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "sdot z21.s, z13.b, z1.b[1]\n"
-                        "sdot z25.s, z13.b, z2.b[1]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "sdot z26.s, z14.b, z2.b[1]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
-                        "sdot z27.s, z15.b, z2.b[1]\n"
-                        "b 7f\n"
-                        "4:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 17f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 18f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 19f\n"
-                        "18:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 19f\n"
-                        "17:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "19:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "sdot z20.s, z8.b, z1.b[0]\n"
-                        "sdot z24.s, z8.b, z2.b[0]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
-                        "sdot z26.s, z10.b, z2.b[0]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "sdot z23.s, z11.b, z1.b[0]\n"
-                        "sdot z27.s, z11.b, z2.b[0]\n"
-                        "b 7f\n"
-                        "3:\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
-                        "sdot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
-                        "sdot z24.s, z8.b, z2.b[0]\n"
-                        "ld1rqb z6.b, p6/z, [a_ptr2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        "sdot z26.s, z10.b, z2.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z23.s, z11.b, z1.b[0]\n"
-                        "sdot z27.s, z11.b, z2.b[0]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
-                        "sdot z24.s, z12.b, z2.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "sdot z21.s, z13.b, z1.b[1]\n"
-                        "sdot z25.s, z13.b, z2.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "sdot z26.s, z14.b, z2.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
-                        "sdot z27.s, z15.b, z2.b[1]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
-                        "sdot z24.s, z8.b, z2.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "sdot z21.s, z9.b, z1.b[2]\n"
-                        "sdot z25.s, z9.b, z2.b[2]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "sdot z26.s, z10.b, z2.b[2]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
-                        "sdot z27.s, z11.b, z2.b[2]\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "sdot z20.s, z12.b, z1.b[3]\n"
-                        "sdot z24.s, z12.b, z2.b[3]\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "sdot z21.s, z13.b, z1.b[3]\n"
-                        "sdot z25.s, z13.b, z2.b[3]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "sdot z22.s, z14.b, z1.b[3]\n"
-                        "sdot z26.s, z14.b, z2.b[3]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "sdot z23.s, z15.b, z1.b[3]\n"
-                        "sdot z27.s, z15.b, z2.b[3]\n"
-                        "cbz %[blocks], 20f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "sdot z20.s, z8.b, z5.b[0]\n"
-                        "sdot z24.s, z8.b, z6.b[0]\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
-                        "sdot z21.s, z9.b, z5.b[0]\n"
-                        "sdot z25.s, z9.b, z6.b[0]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "sdot z22.s, z10.b, z5.b[0]\n"
-                        "sdot z26.s, z10.b, z6.b[0]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "sdot z23.s, z11.b, z5.b[0]\n"
-                        "sdot z27.s, z11.b, z6.b[0]\n"
-                        "b.eq 21f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "sdot z20.s, z12.b, z5.b[1]\n"
-                        "sdot z24.s, z12.b, z6.b[1]\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
-                        "sdot z21.s, z13.b, z5.b[1]\n"
-                        "sdot z25.s, z13.b, z6.b[1]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "sdot z22.s, z14.b, z5.b[1]\n"
-                        "sdot z26.s, z14.b, z6.b[1]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "sdot z23.s, z15.b, z5.b[1]\n"
-                        "sdot z27.s, z15.b, z6.b[1]\n"
-                        "b.eq 22f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "sdot z20.s, z8.b, z5.b[2]\n"
-                        "sdot z24.s, z8.b, z6.b[2]\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
-                        "sdot z21.s, z9.b, z5.b[2]\n"
-                        "sdot z25.s, z9.b, z6.b[2]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "sdot z22.s, z10.b, z5.b[2]\n"
-                        "sdot z26.s, z10.b, z6.b[2]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "sdot z23.s, z11.b, z5.b[2]\n"
-                        "sdot z27.s, z11.b, z6.b[2]\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 23f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 24f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 25f\n"
-                        "24:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 25f\n"
-                        "23:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "25:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z16.s, z12.b, z4.b[3]\n"
-                        "sdot z20.s, z12.b, z5.b[3]\n"
-                        "sdot z24.s, z12.b, z6.b[3]\n"
-                        "sdot z17.s, z13.b, z4.b[3]\n"
-                        "sdot z21.s, z13.b, z5.b[3]\n"
-                        "sdot z25.s, z13.b, z6.b[3]\n"
-                        "sdot z18.s, z14.b, z4.b[3]\n"
-                        "sdot z22.s, z14.b, z5.b[3]\n"
-                        "sdot z26.s, z14.b, z6.b[3]\n"
-                        "sdot z19.s, z15.b, z4.b[3]\n"
-                        "sdot z23.s, z15.b, z5.b[3]\n"
-                        "sdot z27.s, z15.b, z6.b[3]\n"
-                        "b 7f\n"
-                        "22:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 26f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 27f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 28f\n"
-                        "27:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 28f\n"
-                        "26:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "28:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "sdot z20.s, z8.b, z5.b[2]\n"
-                        "sdot z24.s, z8.b, z6.b[2]\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
-                        "sdot z21.s, z9.b, z5.b[2]\n"
-                        "sdot z25.s, z9.b, z6.b[2]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "sdot z22.s, z10.b, z5.b[2]\n"
-                        "sdot z26.s, z10.b, z6.b[2]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "sdot z23.s, z11.b, z5.b[2]\n"
-                        "sdot z27.s, z11.b, z6.b[2]\n"
-                        "b 7f\n"
-                        "21:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 29f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 30f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 31f\n"
-                        "30:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 31f\n"
-                        "29:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "31:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "sdot z20.s, z12.b, z5.b[1]\n"
-                        "sdot z24.s, z12.b, z6.b[1]\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
-                        "sdot z21.s, z13.b, z5.b[1]\n"
-                        "sdot z25.s, z13.b, z6.b[1]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "sdot z22.s, z14.b, z5.b[1]\n"
-                        "sdot z26.s, z14.b, z6.b[1]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "sdot z23.s, z15.b, z5.b[1]\n"
-                        "sdot z27.s, z15.b, z6.b[1]\n"
-                        "b 7f\n"
-                        "20:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 32f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 33f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 34f\n"
-                        "33:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 34f\n"
-                        "32:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "34:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "sdot z20.s, z8.b, z5.b[0]\n"
-                        "sdot z24.s, z8.b, z6.b[0]\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
-                        "sdot z21.s, z9.b, z5.b[0]\n"
-                        "sdot z25.s, z9.b, z6.b[0]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "sdot z22.s, z10.b, z5.b[0]\n"
-                        "sdot z26.s, z10.b, z6.b[0]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "sdot z23.s, z11.b, z5.b[0]\n"
-                        "sdot z27.s, z11.b, z6.b[0]\n"
-                        "7:\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "st1w z20.s, p0, [c_ptr1]\n"
-                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
-                        "st1w z24.s, p0, [c_ptr2]\n"
-                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
-                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
-                    );
-                    break;
-                default:
-                case 4:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "mov z16.s, #0\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "mov z17.s, #0\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "mov z18.s, #0\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "mov z19.s, #0\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "mov z20.s, #0\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "mov z21.s, #0\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "mov z22.s, #0\n"
-                        "whilelt p6.b, %[temp], %[leftovers]\n"
-                        "mov z23.s, #0\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "mov z24.s, #0\n"
-                        "whilelt p4.b, %[temp], %[width]\n"
-                        "mov z25.s, #0\n"
-                        "incw %[temp], all, mul #1\n"
-                        "mov z26.s, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "mov z27.s, #0\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "mov z28.s, #0\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "mov z29.s, #0\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "incw %[temp], all, mul #1\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "mov z30.s, #0\n"
-                        "ptrue p7.b\n"
-                        "mov z31.s, #0\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "sdot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "sdot z24.s, z8.b, z2.b[0]\n"
-                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
-                        "sdot z28.s, z8.b, z3.b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "sdot z29.s, z9.b, z3.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        "sdot z26.s, z10.b, z2.b[0]\n"
-                        "sdot z30.s, z10.b, z3.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z23.s, z11.b, z1.b[0]\n"
-                        "sdot z27.s, z11.b, z2.b[0]\n"
-                        "sdot z31.s, z11.b, z3.b[0]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
-                        "sdot z24.s, z12.b, z2.b[1]\n"
-                        "sdot z28.s, z12.b, z3.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "sdot z21.s, z13.b, z1.b[1]\n"
-                        "sdot z25.s, z13.b, z2.b[1]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z29.s, z13.b, z3.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z26.s, z14.b, z2.b[1]\n"
-                        "sdot z30.s, z14.b, z3.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
-                        "sdot z27.s, z15.b, z2.b[1]\n"
-                        "sdot z31.s, z15.b, z3.b[1]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
-                        "sdot z24.s, z8.b, z2.b[2]\n"
-                        "sdot z28.s, z8.b, z3.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "sdot z21.s, z9.b, z1.b[2]\n"
-                        "sdot z25.s, z9.b, z2.b[2]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z29.s, z9.b, z3.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z26.s, z10.b, z2.b[2]\n"
-                        "sdot z30.s, z10.b, z3.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
-                        "sdot z27.s, z11.b, z2.b[2]\n"
-                        "sdot z31.s, z11.b, z3.b[2]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z12.b, z1.b[3]\n"
-                        "sdot z24.s, z12.b, z2.b[3]\n"
-                        "sdot z28.s, z12.b, z3.b[3]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "sdot z21.s, z13.b, z1.b[3]\n"
-                        "sdot z25.s, z13.b, z2.b[3]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z29.s, z13.b, z3.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z14.b, z1.b[3]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z26.s, z14.b, z2.b[3]\n"
-                        "sdot z30.s, z14.b, z3.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "sdot z23.s, z15.b, z1.b[3]\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
-                        "sdot z27.s, z15.b, z2.b[3]\n"
-                        "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
-                        "sdot z31.s, z15.b, z3.b[3]\n"
-                        "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z8.b, z5.b[0]\n"
-                        "sdot z24.s, z8.b, z6.b[0]\n"
-                        "sdot z28.s, z8.b, z7.b[0]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
-                        "sdot z21.s, z9.b, z5.b[0]\n"
-                        "sdot z25.s, z9.b, z6.b[0]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z29.s, z9.b, z7.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z10.b, z5.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z26.s, z10.b, z6.b[0]\n"
-                        "sdot z30.s, z10.b, z7.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z23.s, z11.b, z5.b[0]\n"
-                        "sdot z27.s, z11.b, z6.b[0]\n"
-                        "sdot z31.s, z11.b, z7.b[0]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z12.b, z5.b[1]\n"
-                        "sdot z24.s, z12.b, z6.b[1]\n"
-                        "sdot z28.s, z12.b, z7.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
-                        "sdot z21.s, z13.b, z5.b[1]\n"
-                        "sdot z25.s, z13.b, z6.b[1]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z29.s, z13.b, z7.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z14.b, z5.b[1]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z26.s, z14.b, z6.b[1]\n"
-                        "sdot z30.s, z14.b, z7.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z23.s, z15.b, z5.b[1]\n"
-                        "sdot z27.s, z15.b, z6.b[1]\n"
-                        "sdot z31.s, z15.b, z7.b[1]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z8.b, z5.b[2]\n"
-                        "sdot z24.s, z8.b, z6.b[2]\n"
-                        "sdot z28.s, z8.b, z7.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
-                        "sdot z21.s, z9.b, z5.b[2]\n"
-                        "sdot z25.s, z9.b, z6.b[2]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z29.s, z9.b, z7.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z10.b, z5.b[2]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z26.s, z10.b, z6.b[2]\n"
-                        "sdot z30.s, z10.b, z7.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z23.s, z11.b, z5.b[2]\n"
-                        "sdot z27.s, z11.b, z6.b[2]\n"
-                        "sdot z31.s, z11.b, z7.b[2]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z12.b, z4.b[3]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z12.b, z5.b[3]\n"
-                        "sdot z24.s, z12.b, z6.b[3]\n"
-                        "sdot z28.s, z12.b, z7.b[3]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "sdot z17.s, z13.b, z4.b[3]\n"
-                        "sdot z21.s, z13.b, z5.b[3]\n"
-                        "sdot z25.s, z13.b, z6.b[3]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z29.s, z13.b, z7.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z4.b[3]\n"
-                        "sdot z22.s, z14.b, z5.b[3]\n"
-                        "sdot z26.s, z14.b, z6.b[3]\n"
-                        "sdot z30.s, z14.b, z7.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z4.b[3]\n"
-                        "sdot z23.s, z15.b, z5.b[3]\n"
-                        "sdot z27.s, z15.b, z6.b[3]\n"
-                        "sdot z31.s, z15.b, z7.b[3]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "cbz %[regs], 3f\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "sdot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "sdot z24.s, z8.b, z2.b[0]\n"
-                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "sdot z28.s, z8.b, z3.b[0]\n"
-                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "sdot z29.s, z9.b, z3.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
-                        "sdot z26.s, z10.b, z2.b[0]\n"
-                        "sdot z30.s, z10.b, z3.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z23.s, z11.b, z1.b[0]\n"
-                        "sdot z27.s, z11.b, z2.b[0]\n"
-                        "sdot z31.s, z11.b, z3.b[0]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
-                        "sdot z24.s, z12.b, z2.b[1]\n"
-                        "sdot z28.s, z12.b, z3.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "sdot z21.s, z13.b, z1.b[1]\n"
-                        "sdot z25.s, z13.b, z2.b[1]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z29.s, z13.b, z3.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z26.s, z14.b, z2.b[1]\n"
-                        "sdot z30.s, z14.b, z3.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
-                        "sdot z27.s, z15.b, z2.b[1]\n"
-                        "sdot z31.s, z15.b, z3.b[1]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
-                        "sdot z24.s, z8.b, z2.b[2]\n"
-                        "sdot z28.s, z8.b, z3.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "sdot z21.s, z9.b, z1.b[2]\n"
-                        "sdot z25.s, z9.b, z2.b[2]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z29.s, z9.b, z3.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z26.s, z10.b, z2.b[2]\n"
-                        "sdot z30.s, z10.b, z3.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
-                        "sdot z27.s, z11.b, z2.b[2]\n"
-                        "sdot z31.s, z11.b, z3.b[2]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z12.b, z1.b[3]\n"
-                        "sdot z24.s, z12.b, z2.b[3]\n"
-                        "sdot z28.s, z12.b, z3.b[3]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "sdot z21.s, z13.b, z1.b[3]\n"
-                        "sdot z25.s, z13.b, z2.b[3]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z29.s, z13.b, z3.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z14.b, z1.b[3]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z26.s, z14.b, z2.b[3]\n"
-                        "sdot z30.s, z14.b, z3.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
-                        "sdot z23.s, z15.b, z1.b[3]\n"
-                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
-                        "sdot z27.s, z15.b, z2.b[3]\n"
-                        "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
-                        "sdot z31.s, z15.b, z3.b[3]\n"
-                        "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z8.b, z5.b[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "sdot z24.s, z8.b, z6.b[0]\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        "sdot z28.s, z8.b, z7.b[0]\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "addvl a_ptr3, a_ptr3, #2\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
-                        "sdot z21.s, z9.b, z5.b[0]\n"
-                        "sdot z25.s, z9.b, z6.b[0]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z29.s, z9.b, z7.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z22.s, z10.b, z5.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z26.s, z10.b, z6.b[0]\n"
-                        "sdot z30.s, z10.b, z7.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z23.s, z11.b, z5.b[0]\n"
-                        "sdot z27.s, z11.b, z6.b[0]\n"
-                        "sdot z31.s, z11.b, z7.b[0]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z12.b, z5.b[1]\n"
-                        "sdot z24.s, z12.b, z6.b[1]\n"
-                        "sdot z28.s, z12.b, z7.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
-                        "sdot z21.s, z13.b, z5.b[1]\n"
-                        "sdot z25.s, z13.b, z6.b[1]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z29.s, z13.b, z7.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "sdot z22.s, z14.b, z5.b[1]\n"
-                        "sdot z26.s, z14.b, z6.b[1]\n"
-                        "sdot z30.s, z14.b, z7.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "sdot z23.s, z15.b, z5.b[1]\n"
-                        "sdot z27.s, z15.b, z6.b[1]\n"
-                        "sdot z31.s, z15.b, z7.b[1]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "sdot z20.s, z8.b, z5.b[2]\n"
-                        "sdot z24.s, z8.b, z6.b[2]\n"
-                        "sdot z28.s, z8.b, z7.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
-                        "sdot z21.s, z9.b, z5.b[2]\n"
-                        "sdot z25.s, z9.b, z6.b[2]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z29.s, z9.b, z7.b[2]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "sdot z22.s, z10.b, z5.b[2]\n"
-                        "sdot z26.s, z10.b, z6.b[2]\n"
-                        "sdot z30.s, z10.b, z7.b[2]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "sdot z23.s, z11.b, z5.b[2]\n"
-                        "sdot z27.s, z11.b, z6.b[2]\n"
-                        "sdot z31.s, z11.b, z7.b[2]\n"
-                        "sdot z16.s, z12.b, z4.b[3]\n"
-                        "sdot z20.s, z12.b, z5.b[3]\n"
-                        "sdot z24.s, z12.b, z6.b[3]\n"
-                        "sdot z28.s, z12.b, z7.b[3]\n"
-                        "sdot z17.s, z13.b, z4.b[3]\n"
-                        "sdot z21.s, z13.b, z5.b[3]\n"
-                        "sdot z25.s, z13.b, z6.b[3]\n"
-                        "sdot z29.s, z13.b, z7.b[3]\n"
-                        "sdot z18.s, z14.b, z4.b[3]\n"
-                        "sdot z22.s, z14.b, z5.b[3]\n"
-                        "sdot z26.s, z14.b, z6.b[3]\n"
-                        "sdot z30.s, z14.b, z7.b[3]\n"
-                        "sdot z19.s, z15.b, z4.b[3]\n"
-                        "sdot z23.s, z15.b, z5.b[3]\n"
-                        "sdot z27.s, z15.b, z6.b[3]\n"
-                        "sdot z31.s, z15.b, z7.b[3]\n"
-                        "cbz %[blocks], 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "sdot z20.s, z8.b, z1.b[0]\n"
-                        "sdot z24.s, z8.b, z2.b[0]\n"
-                        "sdot z28.s, z8.b, z3.b[0]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "sdot z29.s, z9.b, z3.b[0]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
-                        "sdot z26.s, z10.b, z2.b[0]\n"
-                        "sdot z30.s, z10.b, z3.b[0]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "sdot z23.s, z11.b, z1.b[0]\n"
-                        "sdot z27.s, z11.b, z2.b[0]\n"
-                        "sdot z31.s, z11.b, z3.b[0]\n"
-                        "b.eq 5f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
-                        "sdot z24.s, z12.b, z2.b[1]\n"
-                        "sdot z28.s, z12.b, z3.b[1]\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "sdot z21.s, z13.b, z1.b[1]\n"
-                        "sdot z25.s, z13.b, z2.b[1]\n"
-                        "sdot z29.s, z13.b, z3.b[1]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "sdot z26.s, z14.b, z2.b[1]\n"
-                        "sdot z30.s, z14.b, z3.b[1]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
-                        "sdot z27.s, z15.b, z2.b[1]\n"
-                        "sdot z31.s, z15.b, z3.b[1]\n"
-                        "b.eq 6f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
-                        "sdot z24.s, z8.b, z2.b[2]\n"
-                        "sdot z28.s, z8.b, z3.b[2]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "sdot z21.s, z9.b, z1.b[2]\n"
-                        "sdot z25.s, z9.b, z2.b[2]\n"
-                        "sdot z29.s, z9.b, z3.b[2]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "sdot z26.s, z10.b, z2.b[2]\n"
-                        "sdot z30.s, z10.b, z3.b[2]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
-                        "sdot z27.s, z11.b, z2.b[2]\n"
-                        "sdot z31.s, z11.b, z3.b[2]\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 8f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 10f\n"
-                        "9:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 10f\n"
-                        "8:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "10:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "sdot z20.s, z12.b, z1.b[3]\n"
-                        "sdot z24.s, z12.b, z2.b[3]\n"
-                        "sdot z28.s, z12.b, z3.b[3]\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "sdot z21.s, z13.b, z1.b[3]\n"
-                        "sdot z25.s, z13.b, z2.b[3]\n"
-                        "sdot z29.s, z13.b, z3.b[3]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "sdot z22.s, z14.b, z1.b[3]\n"
-                        "sdot z26.s, z14.b, z2.b[3]\n"
-                        "sdot z30.s, z14.b, z3.b[3]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "sdot z23.s, z15.b, z1.b[3]\n"
-                        "sdot z27.s, z15.b, z2.b[3]\n"
-                        "sdot z31.s, z15.b, z3.b[3]\n"
-                        "b 7f\n"
-                        "6:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 11f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 12f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 13f\n"
-                        "12:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 13f\n"
-                        "11:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "13:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
-                        "sdot z24.s, z8.b, z2.b[2]\n"
-                        "sdot z28.s, z8.b, z3.b[2]\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "sdot z21.s, z9.b, z1.b[2]\n"
-                        "sdot z25.s, z9.b, z2.b[2]\n"
-                        "sdot z29.s, z9.b, z3.b[2]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "sdot z26.s, z10.b, z2.b[2]\n"
-                        "sdot z30.s, z10.b, z3.b[2]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
-                        "sdot z27.s, z11.b, z2.b[2]\n"
-                        "sdot z31.s, z11.b, z3.b[2]\n"
-                        "b 7f\n"
-                        "5:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 14f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 15f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 16f\n"
-                        "15:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 16f\n"
-                        "14:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "16:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
-                        "sdot z24.s, z12.b, z2.b[1]\n"
-                        "sdot z28.s, z12.b, z3.b[1]\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "sdot z21.s, z13.b, z1.b[1]\n"
-                        "sdot z25.s, z13.b, z2.b[1]\n"
-                        "sdot z29.s, z13.b, z3.b[1]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "sdot z26.s, z14.b, z2.b[1]\n"
-                        "sdot z30.s, z14.b, z3.b[1]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
-                        "sdot z27.s, z15.b, z2.b[1]\n"
-                        "sdot z31.s, z15.b, z3.b[1]\n"
-                        "b 7f\n"
-                        "4:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 17f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 18f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 19f\n"
-                        "18:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 19f\n"
-                        "17:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "19:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "sdot z20.s, z8.b, z1.b[0]\n"
-                        "sdot z24.s, z8.b, z2.b[0]\n"
-                        "sdot z28.s, z8.b, z3.b[0]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "sdot z29.s, z9.b, z3.b[0]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
-                        "sdot z26.s, z10.b, z2.b[0]\n"
-                        "sdot z30.s, z10.b, z3.b[0]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "sdot z23.s, z11.b, z1.b[0]\n"
-                        "sdot z27.s, z11.b, z2.b[0]\n"
-                        "sdot z31.s, z11.b, z3.b[0]\n"
-                        "b 7f\n"
-                        "3:\n"
-                        "sdot z16.s, z8.b, z0.b[0]\n"
-                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
-                        "sdot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
-                        "sdot z24.s, z8.b, z2.b[0]\n"
-                        "ld1rqb z6.b, p6/z, [a_ptr2]\n"
-                        "sdot z28.s, z8.b, z3.b[0]\n"
-                        "ld1rqb z7.b, p6/z, [a_ptr3]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z17.s, z9.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "sdot z25.s, z9.b, z2.b[0]\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        "sdot z29.s, z9.b, z3.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "sdot z22.s, z10.b, z1.b[0]\n"
-                        "addvl a_ptr3, a_ptr3, #1\n"
-                        "sdot z26.s, z10.b, z2.b[0]\n"
-                        "sdot z30.s, z10.b, z3.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z11.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "sdot z23.s, z11.b, z1.b[0]\n"
-                        "sdot z27.s, z11.b, z2.b[0]\n"
-                        "sdot z31.s, z11.b, z3.b[0]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z12.b, z0.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "sdot z20.s, z12.b, z1.b[1]\n"
-                        "sdot z24.s, z12.b, z2.b[1]\n"
-                        "sdot z28.s, z12.b, z3.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "sdot z17.s, z13.b, z0.b[1]\n"
-                        "sdot z21.s, z13.b, z1.b[1]\n"
-                        "sdot z25.s, z13.b, z2.b[1]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "sdot z29.s, z13.b, z3.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "sdot z18.s, z14.b, z0.b[1]\n"
-                        "sdot z22.s, z14.b, z1.b[1]\n"
-                        "sdot z26.s, z14.b, z2.b[1]\n"
-                        "sdot z30.s, z14.b, z3.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "sdot z19.s, z15.b, z0.b[1]\n"
-                        "sdot z23.s, z15.b, z1.b[1]\n"
-                        "sdot z27.s, z15.b, z2.b[1]\n"
-                        "sdot z31.s, z15.b, z3.b[1]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "sdot z16.s, z8.b, z0.b[2]\n"
-                        "sdot z20.s, z8.b, z1.b[2]\n"
-                        "sdot z24.s, z8.b, z2.b[2]\n"
-                        "sdot z28.s, z8.b, z3.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "sdot z17.s, z9.b, z0.b[2]\n"
-                        "sdot z21.s, z9.b, z1.b[2]\n"
-                        "sdot z25.s, z9.b, z2.b[2]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z29.s, z9.b, z3.b[2]\n"
-                        "sdot z18.s, z10.b, z0.b[2]\n"
-                        "sdot z22.s, z10.b, z1.b[2]\n"
-                        "sdot z26.s, z10.b, z2.b[2]\n"
-                        "sdot z30.s, z10.b, z3.b[2]\n"
-                        "sdot z19.s, z11.b, z0.b[2]\n"
-                        "sdot z23.s, z11.b, z1.b[2]\n"
-                        "sdot z27.s, z11.b, z2.b[2]\n"
-                        "sdot z31.s, z11.b, z3.b[2]\n"
-                        "sdot z16.s, z12.b, z0.b[3]\n"
-                        "sdot z20.s, z12.b, z1.b[3]\n"
-                        "sdot z24.s, z12.b, z2.b[3]\n"
-                        "sdot z28.s, z12.b, z3.b[3]\n"
-                        "sdot z17.s, z13.b, z0.b[3]\n"
-                        "sdot z21.s, z13.b, z1.b[3]\n"
-                        "sdot z25.s, z13.b, z2.b[3]\n"
-                        "sdot z29.s, z13.b, z3.b[3]\n"
-                        "sdot z18.s, z14.b, z0.b[3]\n"
-                        "sdot z22.s, z14.b, z1.b[3]\n"
-                        "sdot z26.s, z14.b, z2.b[3]\n"
-                        "sdot z30.s, z14.b, z3.b[3]\n"
-                        "sdot z19.s, z15.b, z0.b[3]\n"
-                        "sdot z23.s, z15.b, z1.b[3]\n"
-                        "sdot z27.s, z15.b, z2.b[3]\n"
-                        "sdot z31.s, z15.b, z3.b[3]\n"
-                        "cbz %[blocks], 20f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "sdot z20.s, z8.b, z5.b[0]\n"
-                        "sdot z24.s, z8.b, z6.b[0]\n"
-                        "sdot z28.s, z8.b, z7.b[0]\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
-                        "sdot z21.s, z9.b, z5.b[0]\n"
-                        "sdot z25.s, z9.b, z6.b[0]\n"
-                        "sdot z29.s, z9.b, z7.b[0]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "sdot z22.s, z10.b, z5.b[0]\n"
-                        "sdot z26.s, z10.b, z6.b[0]\n"
-                        "sdot z30.s, z10.b, z7.b[0]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "sdot z23.s, z11.b, z5.b[0]\n"
-                        "sdot z27.s, z11.b, z6.b[0]\n"
-                        "sdot z31.s, z11.b, z7.b[0]\n"
-                        "b.eq 21f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "sdot z20.s, z12.b, z5.b[1]\n"
-                        "sdot z24.s, z12.b, z6.b[1]\n"
-                        "sdot z28.s, z12.b, z7.b[1]\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
-                        "sdot z21.s, z13.b, z5.b[1]\n"
-                        "sdot z25.s, z13.b, z6.b[1]\n"
-                        "sdot z29.s, z13.b, z7.b[1]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "sdot z22.s, z14.b, z5.b[1]\n"
-                        "sdot z26.s, z14.b, z6.b[1]\n"
-                        "sdot z30.s, z14.b, z7.b[1]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "sdot z23.s, z15.b, z5.b[1]\n"
-                        "sdot z27.s, z15.b, z6.b[1]\n"
-                        "sdot z31.s, z15.b, z7.b[1]\n"
-                        "b.eq 22f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "sdot z20.s, z8.b, z5.b[2]\n"
-                        "sdot z24.s, z8.b, z6.b[2]\n"
-                        "sdot z28.s, z8.b, z7.b[2]\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
-                        "sdot z21.s, z9.b, z5.b[2]\n"
-                        "sdot z25.s, z9.b, z6.b[2]\n"
-                        "sdot z29.s, z9.b, z7.b[2]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "sdot z22.s, z10.b, z5.b[2]\n"
-                        "sdot z26.s, z10.b, z6.b[2]\n"
-                        "sdot z30.s, z10.b, z7.b[2]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "sdot z23.s, z11.b, z5.b[2]\n"
-                        "sdot z27.s, z11.b, z6.b[2]\n"
-                        "sdot z31.s, z11.b, z7.b[2]\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 23f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 24f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 25f\n"
-                        "24:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 25f\n"
-                        "23:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "25:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z16.s, z12.b, z4.b[3]\n"
-                        "sdot z20.s, z12.b, z5.b[3]\n"
-                        "sdot z24.s, z12.b, z6.b[3]\n"
-                        "sdot z28.s, z12.b, z7.b[3]\n"
-                        "sdot z17.s, z13.b, z4.b[3]\n"
-                        "sdot z21.s, z13.b, z5.b[3]\n"
-                        "sdot z25.s, z13.b, z6.b[3]\n"
-                        "sdot z29.s, z13.b, z7.b[3]\n"
-                        "sdot z18.s, z14.b, z4.b[3]\n"
-                        "sdot z22.s, z14.b, z5.b[3]\n"
-                        "sdot z26.s, z14.b, z6.b[3]\n"
-                        "sdot z30.s, z14.b, z7.b[3]\n"
-                        "sdot z19.s, z15.b, z4.b[3]\n"
-                        "sdot z23.s, z15.b, z5.b[3]\n"
-                        "sdot z27.s, z15.b, z6.b[3]\n"
-                        "sdot z31.s, z15.b, z7.b[3]\n"
-                        "b 7f\n"
-                        "22:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 26f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 27f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 28f\n"
-                        "27:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 28f\n"
-                        "26:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "28:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z4.b[2]\n"
-                        "sdot z20.s, z8.b, z5.b[2]\n"
-                        "sdot z24.s, z8.b, z6.b[2]\n"
-                        "sdot z28.s, z8.b, z7.b[2]\n"
-                        "sdot z17.s, z9.b, z4.b[2]\n"
-                        "sdot z21.s, z9.b, z5.b[2]\n"
-                        "sdot z25.s, z9.b, z6.b[2]\n"
-                        "sdot z29.s, z9.b, z7.b[2]\n"
-                        "sdot z18.s, z10.b, z4.b[2]\n"
-                        "sdot z22.s, z10.b, z5.b[2]\n"
-                        "sdot z26.s, z10.b, z6.b[2]\n"
-                        "sdot z30.s, z10.b, z7.b[2]\n"
-                        "sdot z19.s, z11.b, z4.b[2]\n"
-                        "sdot z23.s, z11.b, z5.b[2]\n"
-                        "sdot z27.s, z11.b, z6.b[2]\n"
-                        "sdot z31.s, z11.b, z7.b[2]\n"
-                        "b 7f\n"
-                        "21:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 29f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 30f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 31f\n"
-                        "30:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 31f\n"
-                        "29:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "31:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "sdot z16.s, z12.b, z4.b[1]\n"
-                        "sdot z20.s, z12.b, z5.b[1]\n"
-                        "sdot z24.s, z12.b, z6.b[1]\n"
-                        "sdot z28.s, z12.b, z7.b[1]\n"
-                        "sdot z17.s, z13.b, z4.b[1]\n"
-                        "sdot z21.s, z13.b, z5.b[1]\n"
-                        "sdot z25.s, z13.b, z6.b[1]\n"
-                        "sdot z29.s, z13.b, z7.b[1]\n"
-                        "sdot z18.s, z14.b, z4.b[1]\n"
-                        "sdot z22.s, z14.b, z5.b[1]\n"
-                        "sdot z26.s, z14.b, z6.b[1]\n"
-                        "sdot z30.s, z14.b, z7.b[1]\n"
-                        "sdot z19.s, z15.b, z4.b[1]\n"
-                        "sdot z23.s, z15.b, z5.b[1]\n"
-                        "sdot z27.s, z15.b, z6.b[1]\n"
-                        "sdot z31.s, z15.b, z7.b[1]\n"
-                        "b 7f\n"
-                        "20:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 32f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 33f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 34f\n"
-                        "33:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 34f\n"
-                        "32:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "34:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "sdot z16.s, z8.b, z4.b[0]\n"
-                        "sdot z20.s, z8.b, z5.b[0]\n"
-                        "sdot z24.s, z8.b, z6.b[0]\n"
-                        "sdot z28.s, z8.b, z7.b[0]\n"
-                        "sdot z17.s, z9.b, z4.b[0]\n"
-                        "sdot z21.s, z9.b, z5.b[0]\n"
-                        "sdot z25.s, z9.b, z6.b[0]\n"
-                        "sdot z29.s, z9.b, z7.b[0]\n"
-                        "sdot z18.s, z10.b, z4.b[0]\n"
-                        "sdot z22.s, z10.b, z5.b[0]\n"
-                        "sdot z26.s, z10.b, z6.b[0]\n"
-                        "sdot z30.s, z10.b, z7.b[0]\n"
-                        "sdot z19.s, z11.b, z4.b[0]\n"
-                        "sdot z23.s, z11.b, z5.b[0]\n"
-                        "sdot z27.s, z11.b, z6.b[0]\n"
-                        "sdot z31.s, z11.b, z7.b[0]\n"
-                        "7:\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "st1w z20.s, p0, [c_ptr1]\n"
-                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
-                        "st1w z24.s, p0, [c_ptr2]\n"
-                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
-                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
-                        "st1w z28.s, p0, [c_ptr3]\n"
-                        "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
-                        "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
-                        "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
-                    );
-                    break;
-            }
-
-        }
-    }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp
deleted file mode 100644
index 33e3ac6..0000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
-#include <cstdint>
-
-
-namespace arm_gemm
-{
-
-// Actual kernel implementations
-void sve_native_u8u32_dot_4VLx4(const uint8_t *, int, const uint8_t *, int ldb, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
-
-class native_u8u32_dot_4VLx4
-{
-public:
-    typedef uint8_t operand_type;
-    typedef uint32_t result_type;
-
-    typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, int ldb, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
-
-    /* Kernel blocking parameters */
-    static constexpr unsigned int out_height()
-    {
-        return 4;
-    }
-
-    static unsigned int out_width()
-    {
-        return get_vector_length<uint32_t>() * 4;
-    }
-
-    static constexpr unsigned int k_unroll()
-    {
-        return 4;
-    }
-
-    static constexpr bool supports_append()
-    {
-        return false;
-    }
-
-    static constexpr bool supports_bias()
-    {
-        return false;
-    }
-
-    static constexpr bool supports_activation()
-    {
-        return false;
-    }
-
-
-
-    // Default to the generic kernel
-    kern_type kernel=sve_native_u8u32_dot_4VLx4;
-
-    native_u8u32_dot_4VLx4(const CPUInfo *ci) { UNUSED(ci); }
-};
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp
deleted file mode 100644
index 994d608..0000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp
+++ /dev/null
@@ -1,4479 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include <cstdint>
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_native_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, int ldb, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *bias, Activation act, bool append) {
-    const long loops_count = ((K + 16) / 32) - 1;
-    K -= loops_count * 32;
-    const long regs_count = (K / 16) - 1;
-    K -= (regs_count + 1) * 16;
-    const long leftovers = K;
-    const long blocks_count = K / 4;
-    const long odds_count = K - (blocks_count * 4);
-
-    for (int y=0; y<M; y+=4) {
-        const uint8_t * const a_ptr0_base = A + (y * lda);
-        const unsigned long ldab = lda * sizeof(uint8_t);
-
-        uint32_t *c_ptr0 = C + (y * ldc);
-
-        for (int x0=0; x0<N; x0+=(4 * get_vector_length<uint32_t>())) {
-            const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<uint32_t>()));
-            long loops = loops_count;
-            long regs = regs_count;
-            long temp = 0;
-            long blocks = blocks_count;
-            long odds = odds_count;
-            const uint8_t *a_ptr0 = a_ptr0_base;
-            const uint8_t *b_ptr0 = B + x0;
-            const uint8_t *b_ptr1 = b_ptr0 + ldb;
-            const uint8_t *b_ptr2 = b_ptr1 + ldb;
-            const uint8_t *b_ptr3 = b_ptr2 + ldb;
-            long ldbb = ldb * sizeof(uint8_t) * 4;
-            const unsigned long ldcb = ldc * sizeof(uint32_t);
-
-            switch(M-y) {
-                case 1:
-                    __asm __volatile (
-                        "mov z16.s, #0\n"
-                        "whilelt p6.b, %[temp], %[leftovers]\n"
-                        "mov z17.s, #0\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "mov z18.s, #0\n"
-                        "whilelt p4.b, %[temp], %[width]\n"
-                        "mov z19.s, #0\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ptrue p7.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z16.s, z12.b, z4.b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z17.s, z13.b, z4.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z18.s, z14.b, z4.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z4.b[3]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "cbz %[regs], 3f\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z16.s, z12.b, z4.b[3]\n"
-                        "udot z17.s, z13.b, z4.b[3]\n"
-                        "udot z18.s, z14.b, z4.b[3]\n"
-                        "udot z19.s, z15.b, z4.b[3]\n"
-                        "cbz %[blocks], 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "b.eq 5f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "b.eq 6f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 8f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 10f\n"
-                        "9:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 10f\n"
-                        "8:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "10:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "b 7f\n"
-                        "6:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 11f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 12f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 13f\n"
-                        "12:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 13f\n"
-                        "11:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "13:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "b 7f\n"
-                        "5:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 14f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 15f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 16f\n"
-                        "15:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 16f\n"
-                        "14:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "16:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "b 7f\n"
-                        "4:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 17f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 18f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 19f\n"
-                        "18:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 19f\n"
-                        "17:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "19:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "b 7f\n"
-                        "3:\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "cbz %[blocks], 20f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "b.eq 21f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "b.eq 22f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 23f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 24f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 25f\n"
-                        "24:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 25f\n"
-                        "23:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "25:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z16.s, z12.b, z4.b[3]\n"
-                        "udot z17.s, z13.b, z4.b[3]\n"
-                        "udot z18.s, z14.b, z4.b[3]\n"
-                        "udot z19.s, z15.b, z4.b[3]\n"
-                        "b 7f\n"
-                        "22:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 26f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 27f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 28f\n"
-                        "27:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 28f\n"
-                        "26:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "28:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "b 7f\n"
-                        "21:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 29f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 30f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 31f\n"
-                        "30:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 31f\n"
-                        "29:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "31:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "b 7f\n"
-                        "20:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 32f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 33f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 34f\n"
-                        "33:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 34f\n"
-                        "32:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "34:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "7:\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
-                    );
-                    break;
-                case 2:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "c_ptr1 .req X1\n"
-                        "mov z16.s, #0\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "mov z17.s, #0\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "mov z18.s, #0\n"
-                        "whilelt p6.b, %[temp], %[leftovers]\n"
-                        "mov z19.s, #0\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "mov z20.s, #0\n"
-                        "whilelt p4.b, %[temp], %[width]\n"
-                        "mov z21.s, #0\n"
-                        "incw %[temp], all, mul #1\n"
-                        "mov z22.s, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "mov z23.s, #0\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "ptrue p7.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "udot z20.s, z8.b, z1.b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z23.s, z11.b, z1.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "udot z21.s, z13.b, z1.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "udot z21.s, z9.b, z1.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z12.b, z1.b[3]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "udot z21.s, z13.b, z1.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z14.b, z1.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "udot z23.s, z15.b, z1.b[3]\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z20.s, z8.b, z5.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
-                        "udot z21.s, z9.b, z5.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z10.b, z5.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z23.s, z11.b, z5.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z12.b, z5.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
-                        "udot z21.s, z13.b, z5.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z14.b, z5.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z23.s, z15.b, z5.b[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z8.b, z5.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
-                        "udot z21.s, z9.b, z5.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z10.b, z5.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z23.s, z11.b, z5.b[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z12.b, z4.b[3]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z12.b, z5.b[3]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "udot z17.s, z13.b, z4.b[3]\n"
-                        "udot z21.s, z13.b, z5.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z4.b[3]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z14.b, z5.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z4.b[3]\n"
-                        "udot z23.s, z15.b, z5.b[3]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "cbz %[regs], 3f\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "udot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z23.s, z11.b, z1.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "udot z21.s, z13.b, z1.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "udot z21.s, z9.b, z1.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z12.b, z1.b[3]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "udot z21.s, z13.b, z1.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z14.b, z1.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
-                        "udot z23.s, z15.b, z1.b[3]\n"
-                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z20.s, z8.b, z5.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
-                        "udot z21.s, z9.b, z5.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z10.b, z5.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z23.s, z11.b, z5.b[0]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z12.b, z5.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
-                        "udot z21.s, z13.b, z5.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z14.b, z5.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "udot z23.s, z15.b, z5.b[1]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "udot z20.s, z8.b, z5.b[2]\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "udot z21.s, z9.b, z5.b[2]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "udot z22.s, z10.b, z5.b[2]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "udot z23.s, z11.b, z5.b[2]\n"
-                        "udot z16.s, z12.b, z4.b[3]\n"
-                        "udot z20.s, z12.b, z5.b[3]\n"
-                        "udot z17.s, z13.b, z4.b[3]\n"
-                        "udot z21.s, z13.b, z5.b[3]\n"
-                        "udot z18.s, z14.b, z4.b[3]\n"
-                        "udot z22.s, z14.b, z5.b[3]\n"
-                        "udot z19.s, z15.b, z4.b[3]\n"
-                        "udot z23.s, z15.b, z5.b[3]\n"
-                        "cbz %[blocks], 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "udot z20.s, z8.b, z1.b[0]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "udot z23.s, z11.b, z1.b[0]\n"
-                        "b.eq 5f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "udot z21.s, z13.b, z1.b[1]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
-                        "b.eq 6f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "udot z21.s, z9.b, z1.b[2]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 8f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 10f\n"
-                        "9:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 10f\n"
-                        "8:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "10:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "udot z20.s, z12.b, z1.b[3]\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "udot z21.s, z13.b, z1.b[3]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "udot z22.s, z14.b, z1.b[3]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "udot z23.s, z15.b, z1.b[3]\n"
-                        "b 7f\n"
-                        "6:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 11f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 12f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 13f\n"
-                        "12:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 13f\n"
-                        "11:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "13:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "udot z21.s, z9.b, z1.b[2]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
-                        "b 7f\n"
-                        "5:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 14f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 15f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 16f\n"
-                        "15:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 16f\n"
-                        "14:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "16:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "udot z21.s, z13.b, z1.b[1]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
-                        "b 7f\n"
-                        "4:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 17f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 18f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 19f\n"
-                        "18:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 19f\n"
-                        "17:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "19:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "udot z20.s, z8.b, z1.b[0]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "udot z23.s, z11.b, z1.b[0]\n"
-                        "b 7f\n"
-                        "3:\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
-                        "udot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z23.s, z11.b, z1.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "udot z21.s, z13.b, z1.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "udot z21.s, z9.b, z1.b[2]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "udot z20.s, z12.b, z1.b[3]\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "udot z21.s, z13.b, z1.b[3]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "udot z22.s, z14.b, z1.b[3]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "udot z23.s, z15.b, z1.b[3]\n"
-                        "cbz %[blocks], 20f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "udot z20.s, z8.b, z5.b[0]\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
-                        "udot z21.s, z9.b, z5.b[0]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "udot z22.s, z10.b, z5.b[0]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "udot z23.s, z11.b, z5.b[0]\n"
-                        "b.eq 21f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "udot z20.s, z12.b, z5.b[1]\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
-                        "udot z21.s, z13.b, z5.b[1]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "udot z22.s, z14.b, z5.b[1]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "udot z23.s, z15.b, z5.b[1]\n"
-                        "b.eq 22f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "udot z20.s, z8.b, z5.b[2]\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
-                        "udot z21.s, z9.b, z5.b[2]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "udot z22.s, z10.b, z5.b[2]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "udot z23.s, z11.b, z5.b[2]\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 23f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 24f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 25f\n"
-                        "24:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 25f\n"
-                        "23:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "25:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z16.s, z12.b, z4.b[3]\n"
-                        "udot z20.s, z12.b, z5.b[3]\n"
-                        "udot z17.s, z13.b, z4.b[3]\n"
-                        "udot z21.s, z13.b, z5.b[3]\n"
-                        "udot z18.s, z14.b, z4.b[3]\n"
-                        "udot z22.s, z14.b, z5.b[3]\n"
-                        "udot z19.s, z15.b, z4.b[3]\n"
-                        "udot z23.s, z15.b, z5.b[3]\n"
-                        "b 7f\n"
-                        "22:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 26f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 27f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 28f\n"
-                        "27:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 28f\n"
-                        "26:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "28:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "udot z20.s, z8.b, z5.b[2]\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
-                        "udot z21.s, z9.b, z5.b[2]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "udot z22.s, z10.b, z5.b[2]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "udot z23.s, z11.b, z5.b[2]\n"
-                        "b 7f\n"
-                        "21:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 29f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 30f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 31f\n"
-                        "30:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 31f\n"
-                        "29:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "31:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "udot z20.s, z12.b, z5.b[1]\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
-                        "udot z21.s, z13.b, z5.b[1]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "udot z22.s, z14.b, z5.b[1]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "udot z23.s, z15.b, z5.b[1]\n"
-                        "b 7f\n"
-                        "20:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 32f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 33f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 34f\n"
-                        "33:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 34f\n"
-                        "32:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "34:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "udot z20.s, z8.b, z5.b[0]\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
-                        "udot z21.s, z9.b, z5.b[0]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "udot z22.s, z10.b, z5.b[0]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "udot z23.s, z11.b, z5.b[0]\n"
-                        "7:\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "st1w z20.s, p0, [c_ptr1]\n"
-                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq c_ptr1\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
-                    );
-                    break;
-                case 3:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "c_ptr1 .req X2\n"
-                        "c_ptr2 .req X3\n"
-                        "mov z16.s, #0\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "mov z17.s, #0\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "mov z18.s, #0\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "mov z19.s, #0\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "mov z20.s, #0\n"
-                        "whilelt p6.b, %[temp], %[leftovers]\n"
-                        "mov z21.s, #0\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "mov z22.s, #0\n"
-                        "whilelt p4.b, %[temp], %[width]\n"
-                        "mov z23.s, #0\n"
-                        "incw %[temp], all, mul #1\n"
-                        "mov z24.s, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "mov z25.s, #0\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "mov z26.s, #0\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "mov z27.s, #0\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "incw %[temp], all, mul #1\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "ptrue p7.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "udot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "udot z24.s, z8.b, z2.b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "udot z26.s, z10.b, z2.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z23.s, z11.b, z1.b[0]\n"
-                        "udot z27.s, z11.b, z2.b[0]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
-                        "udot z24.s, z12.b, z2.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "udot z21.s, z13.b, z1.b[1]\n"
-                        "udot z25.s, z13.b, z2.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "udot z26.s, z14.b, z2.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
-                        "udot z27.s, z15.b, z2.b[1]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
-                        "udot z24.s, z8.b, z2.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "udot z21.s, z9.b, z1.b[2]\n"
-                        "udot z25.s, z9.b, z2.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "udot z26.s, z10.b, z2.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
-                        "udot z27.s, z11.b, z2.b[2]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z12.b, z1.b[3]\n"
-                        "udot z24.s, z12.b, z2.b[3]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "udot z21.s, z13.b, z1.b[3]\n"
-                        "udot z25.s, z13.b, z2.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z14.b, z1.b[3]\n"
-                        "udot z26.s, z14.b, z2.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "udot z23.s, z15.b, z1.b[3]\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
-                        "udot z27.s, z15.b, z2.b[3]\n"
-                        "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z8.b, z5.b[0]\n"
-                        "udot z24.s, z8.b, z6.b[0]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
-                        "udot z21.s, z9.b, z5.b[0]\n"
-                        "udot z25.s, z9.b, z6.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z10.b, z5.b[0]\n"
-                        "udot z26.s, z10.b, z6.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z23.s, z11.b, z5.b[0]\n"
-                        "udot z27.s, z11.b, z6.b[0]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z12.b, z5.b[1]\n"
-                        "udot z24.s, z12.b, z6.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
-                        "udot z21.s, z13.b, z5.b[1]\n"
-                        "udot z25.s, z13.b, z6.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z14.b, z5.b[1]\n"
-                        "udot z26.s, z14.b, z6.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z23.s, z15.b, z5.b[1]\n"
-                        "udot z27.s, z15.b, z6.b[1]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z8.b, z5.b[2]\n"
-                        "udot z24.s, z8.b, z6.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
-                        "udot z21.s, z9.b, z5.b[2]\n"
-                        "udot z25.s, z9.b, z6.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z10.b, z5.b[2]\n"
-                        "udot z26.s, z10.b, z6.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z23.s, z11.b, z5.b[2]\n"
-                        "udot z27.s, z11.b, z6.b[2]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z12.b, z4.b[3]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z12.b, z5.b[3]\n"
-                        "udot z24.s, z12.b, z6.b[3]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "udot z17.s, z13.b, z4.b[3]\n"
-                        "udot z21.s, z13.b, z5.b[3]\n"
-                        "udot z25.s, z13.b, z6.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z18.s, z14.b, z4.b[3]\n"
-                        "udot z22.s, z14.b, z5.b[3]\n"
-                        "udot z26.s, z14.b, z6.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z4.b[3]\n"
-                        "udot z23.s, z15.b, z5.b[3]\n"
-                        "udot z27.s, z15.b, z6.b[3]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "cbz %[regs], 3f\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "udot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "udot z24.s, z8.b, z2.b[0]\n"
-                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
-                        "udot z26.s, z10.b, z2.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z23.s, z11.b, z1.b[0]\n"
-                        "udot z27.s, z11.b, z2.b[0]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
-                        "udot z24.s, z12.b, z2.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "udot z21.s, z13.b, z1.b[1]\n"
-                        "udot z25.s, z13.b, z2.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "udot z26.s, z14.b, z2.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
-                        "udot z27.s, z15.b, z2.b[1]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
-                        "udot z24.s, z8.b, z2.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "udot z21.s, z9.b, z1.b[2]\n"
-                        "udot z25.s, z9.b, z2.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "udot z26.s, z10.b, z2.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
-                        "udot z27.s, z11.b, z2.b[2]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z12.b, z1.b[3]\n"
-                        "udot z24.s, z12.b, z2.b[3]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "udot z21.s, z13.b, z1.b[3]\n"
-                        "udot z25.s, z13.b, z2.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z14.b, z1.b[3]\n"
-                        "udot z26.s, z14.b, z2.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
-                        "udot z23.s, z15.b, z1.b[3]\n"
-                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
-                        "udot z27.s, z15.b, z2.b[3]\n"
-                        "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z8.b, z5.b[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "udot z24.s, z8.b, z6.b[0]\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
-                        "udot z21.s, z9.b, z5.b[0]\n"
-                        "udot z25.s, z9.b, z6.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z10.b, z5.b[0]\n"
-                        "udot z26.s, z10.b, z6.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z23.s, z11.b, z5.b[0]\n"
-                        "udot z27.s, z11.b, z6.b[0]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z12.b, z5.b[1]\n"
-                        "udot z24.s, z12.b, z6.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
-                        "udot z21.s, z13.b, z5.b[1]\n"
-                        "udot z25.s, z13.b, z6.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "udot z22.s, z14.b, z5.b[1]\n"
-                        "udot z26.s, z14.b, z6.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "udot z23.s, z15.b, z5.b[1]\n"
-                        "udot z27.s, z15.b, z6.b[1]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "udot z20.s, z8.b, z5.b[2]\n"
-                        "udot z24.s, z8.b, z6.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
-                        "udot z21.s, z9.b, z5.b[2]\n"
-                        "udot z25.s, z9.b, z6.b[2]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "udot z22.s, z10.b, z5.b[2]\n"
-                        "udot z26.s, z10.b, z6.b[2]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "udot z23.s, z11.b, z5.b[2]\n"
-                        "udot z27.s, z11.b, z6.b[2]\n"
-                        "udot z16.s, z12.b, z4.b[3]\n"
-                        "udot z20.s, z12.b, z5.b[3]\n"
-                        "udot z24.s, z12.b, z6.b[3]\n"
-                        "udot z17.s, z13.b, z4.b[3]\n"
-                        "udot z21.s, z13.b, z5.b[3]\n"
-                        "udot z25.s, z13.b, z6.b[3]\n"
-                        "udot z18.s, z14.b, z4.b[3]\n"
-                        "udot z22.s, z14.b, z5.b[3]\n"
-                        "udot z26.s, z14.b, z6.b[3]\n"
-                        "udot z19.s, z15.b, z4.b[3]\n"
-                        "udot z23.s, z15.b, z5.b[3]\n"
-                        "udot z27.s, z15.b, z6.b[3]\n"
-                        "cbz %[blocks], 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "udot z20.s, z8.b, z1.b[0]\n"
-                        "udot z24.s, z8.b, z2.b[0]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
-                        "udot z26.s, z10.b, z2.b[0]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "udot z23.s, z11.b, z1.b[0]\n"
-                        "udot z27.s, z11.b, z2.b[0]\n"
-                        "b.eq 5f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
-                        "udot z24.s, z12.b, z2.b[1]\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "udot z21.s, z13.b, z1.b[1]\n"
-                        "udot z25.s, z13.b, z2.b[1]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "udot z26.s, z14.b, z2.b[1]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
-                        "udot z27.s, z15.b, z2.b[1]\n"
-                        "b.eq 6f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
-                        "udot z24.s, z8.b, z2.b[2]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "udot z21.s, z9.b, z1.b[2]\n"
-                        "udot z25.s, z9.b, z2.b[2]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "udot z26.s, z10.b, z2.b[2]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
-                        "udot z27.s, z11.b, z2.b[2]\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 8f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 10f\n"
-                        "9:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 10f\n"
-                        "8:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "10:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "udot z20.s, z12.b, z1.b[3]\n"
-                        "udot z24.s, z12.b, z2.b[3]\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "udot z21.s, z13.b, z1.b[3]\n"
-                        "udot z25.s, z13.b, z2.b[3]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "udot z22.s, z14.b, z1.b[3]\n"
-                        "udot z26.s, z14.b, z2.b[3]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "udot z23.s, z15.b, z1.b[3]\n"
-                        "udot z27.s, z15.b, z2.b[3]\n"
-                        "b 7f\n"
-                        "6:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 11f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 12f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 13f\n"
-                        "12:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 13f\n"
-                        "11:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "13:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
-                        "udot z24.s, z8.b, z2.b[2]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "udot z21.s, z9.b, z1.b[2]\n"
-                        "udot z25.s, z9.b, z2.b[2]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "udot z26.s, z10.b, z2.b[2]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
-                        "udot z27.s, z11.b, z2.b[2]\n"
-                        "b 7f\n"
-                        "5:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 14f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 15f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 16f\n"
-                        "15:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 16f\n"
-                        "14:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "16:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
-                        "udot z24.s, z12.b, z2.b[1]\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "udot z21.s, z13.b, z1.b[1]\n"
-                        "udot z25.s, z13.b, z2.b[1]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "udot z26.s, z14.b, z2.b[1]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
-                        "udot z27.s, z15.b, z2.b[1]\n"
-                        "b 7f\n"
-                        "4:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 17f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 18f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 19f\n"
-                        "18:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 19f\n"
-                        "17:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "19:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "udot z20.s, z8.b, z1.b[0]\n"
-                        "udot z24.s, z8.b, z2.b[0]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
-                        "udot z26.s, z10.b, z2.b[0]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "udot z23.s, z11.b, z1.b[0]\n"
-                        "udot z27.s, z11.b, z2.b[0]\n"
-                        "b 7f\n"
-                        "3:\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
-                        "udot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
-                        "udot z24.s, z8.b, z2.b[0]\n"
-                        "ld1rqb z6.b, p6/z, [a_ptr2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        "udot z26.s, z10.b, z2.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z23.s, z11.b, z1.b[0]\n"
-                        "udot z27.s, z11.b, z2.b[0]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
-                        "udot z24.s, z12.b, z2.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "udot z21.s, z13.b, z1.b[1]\n"
-                        "udot z25.s, z13.b, z2.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "udot z26.s, z14.b, z2.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
-                        "udot z27.s, z15.b, z2.b[1]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
-                        "udot z24.s, z8.b, z2.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "udot z21.s, z9.b, z1.b[2]\n"
-                        "udot z25.s, z9.b, z2.b[2]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "udot z26.s, z10.b, z2.b[2]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
-                        "udot z27.s, z11.b, z2.b[2]\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "udot z20.s, z12.b, z1.b[3]\n"
-                        "udot z24.s, z12.b, z2.b[3]\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "udot z21.s, z13.b, z1.b[3]\n"
-                        "udot z25.s, z13.b, z2.b[3]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "udot z22.s, z14.b, z1.b[3]\n"
-                        "udot z26.s, z14.b, z2.b[3]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "udot z23.s, z15.b, z1.b[3]\n"
-                        "udot z27.s, z15.b, z2.b[3]\n"
-                        "cbz %[blocks], 20f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "udot z20.s, z8.b, z5.b[0]\n"
-                        "udot z24.s, z8.b, z6.b[0]\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
-                        "udot z21.s, z9.b, z5.b[0]\n"
-                        "udot z25.s, z9.b, z6.b[0]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "udot z22.s, z10.b, z5.b[0]\n"
-                        "udot z26.s, z10.b, z6.b[0]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "udot z23.s, z11.b, z5.b[0]\n"
-                        "udot z27.s, z11.b, z6.b[0]\n"
-                        "b.eq 21f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "udot z20.s, z12.b, z5.b[1]\n"
-                        "udot z24.s, z12.b, z6.b[1]\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
-                        "udot z21.s, z13.b, z5.b[1]\n"
-                        "udot z25.s, z13.b, z6.b[1]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "udot z22.s, z14.b, z5.b[1]\n"
-                        "udot z26.s, z14.b, z6.b[1]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "udot z23.s, z15.b, z5.b[1]\n"
-                        "udot z27.s, z15.b, z6.b[1]\n"
-                        "b.eq 22f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "udot z20.s, z8.b, z5.b[2]\n"
-                        "udot z24.s, z8.b, z6.b[2]\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
-                        "udot z21.s, z9.b, z5.b[2]\n"
-                        "udot z25.s, z9.b, z6.b[2]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "udot z22.s, z10.b, z5.b[2]\n"
-                        "udot z26.s, z10.b, z6.b[2]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "udot z23.s, z11.b, z5.b[2]\n"
-                        "udot z27.s, z11.b, z6.b[2]\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 23f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 24f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 25f\n"
-                        "24:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 25f\n"
-                        "23:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "25:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z16.s, z12.b, z4.b[3]\n"
-                        "udot z20.s, z12.b, z5.b[3]\n"
-                        "udot z24.s, z12.b, z6.b[3]\n"
-                        "udot z17.s, z13.b, z4.b[3]\n"
-                        "udot z21.s, z13.b, z5.b[3]\n"
-                        "udot z25.s, z13.b, z6.b[3]\n"
-                        "udot z18.s, z14.b, z4.b[3]\n"
-                        "udot z22.s, z14.b, z5.b[3]\n"
-                        "udot z26.s, z14.b, z6.b[3]\n"
-                        "udot z19.s, z15.b, z4.b[3]\n"
-                        "udot z23.s, z15.b, z5.b[3]\n"
-                        "udot z27.s, z15.b, z6.b[3]\n"
-                        "b 7f\n"
-                        "22:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 26f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 27f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 28f\n"
-                        "27:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 28f\n"
-                        "26:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "28:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "udot z20.s, z8.b, z5.b[2]\n"
-                        "udot z24.s, z8.b, z6.b[2]\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
-                        "udot z21.s, z9.b, z5.b[2]\n"
-                        "udot z25.s, z9.b, z6.b[2]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "udot z22.s, z10.b, z5.b[2]\n"
-                        "udot z26.s, z10.b, z6.b[2]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "udot z23.s, z11.b, z5.b[2]\n"
-                        "udot z27.s, z11.b, z6.b[2]\n"
-                        "b 7f\n"
-                        "21:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 29f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 30f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 31f\n"
-                        "30:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 31f\n"
-                        "29:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "31:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "udot z20.s, z12.b, z5.b[1]\n"
-                        "udot z24.s, z12.b, z6.b[1]\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
-                        "udot z21.s, z13.b, z5.b[1]\n"
-                        "udot z25.s, z13.b, z6.b[1]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "udot z22.s, z14.b, z5.b[1]\n"
-                        "udot z26.s, z14.b, z6.b[1]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "udot z23.s, z15.b, z5.b[1]\n"
-                        "udot z27.s, z15.b, z6.b[1]\n"
-                        "b 7f\n"
-                        "20:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 32f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 33f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 34f\n"
-                        "33:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 34f\n"
-                        "32:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "34:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "udot z20.s, z8.b, z5.b[0]\n"
-                        "udot z24.s, z8.b, z6.b[0]\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
-                        "udot z21.s, z9.b, z5.b[0]\n"
-                        "udot z25.s, z9.b, z6.b[0]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "udot z22.s, z10.b, z5.b[0]\n"
-                        "udot z26.s, z10.b, z6.b[0]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "udot z23.s, z11.b, z5.b[0]\n"
-                        "udot z27.s, z11.b, z6.b[0]\n"
-                        "7:\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "st1w z20.s, p0, [c_ptr1]\n"
-                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
-                        "st1w z24.s, p0, [c_ptr2]\n"
-                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
-                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
-                    );
-                    break;
-                default:
-                case 4:
-                    __asm __volatile (
-                        "a_ptr1 .req X0\n"
-                        "a_ptr2 .req X1\n"
-                        "a_ptr3 .req X2\n"
-                        "c_ptr1 .req X3\n"
-                        "c_ptr2 .req X4\n"
-                        "c_ptr3 .req X5\n"
-                        "mov z16.s, #0\n"
-                        "add a_ptr1, %[a_ptr0], %[lda]\n"
-                        "mov z17.s, #0\n"
-                        "add a_ptr2, a_ptr1, %[lda]\n"
-                        "mov z18.s, #0\n"
-                        "add a_ptr3, a_ptr2, %[lda]\n"
-                        "mov z19.s, #0\n"
-                        "add c_ptr1, %[c_ptr0], %[ldc]\n"
-                        "mov z20.s, #0\n"
-                        "add c_ptr2, c_ptr1, %[ldc]\n"
-                        "mov z21.s, #0\n"
-                        "add c_ptr3, c_ptr2, %[ldc]\n"
-                        "mov z22.s, #0\n"
-                        "whilelt p6.b, %[temp], %[leftovers]\n"
-                        "mov z23.s, #0\n"
-                        "whilelt p0.s, %[temp], %[width]\n"
-                        "mov z24.s, #0\n"
-                        "whilelt p4.b, %[temp], %[width]\n"
-                        "mov z25.s, #0\n"
-                        "incw %[temp], all, mul #1\n"
-                        "mov z26.s, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "mov z27.s, #0\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "mov z28.s, #0\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "mov z29.s, #0\n"
-                        "whilelt p1.s, %[temp], %[width]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "incw %[temp], all, mul #1\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "mov z30.s, #0\n"
-                        "ptrue p7.b\n"
-                        "mov z31.s, #0\n"
-                        "whilelt p2.s, %[temp], %[width]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1]\n"
-                        "ld1rqb z2.b, p7/z, [a_ptr2]\n"
-                        "incw %[temp], all, mul #1\n"
-                        "ld1rqb z3.b, p7/z, [a_ptr3]\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x10\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "whilelt p3.s, %[temp], %[width]\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "add a_ptr1, a_ptr1, #0x10\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "add a_ptr2, a_ptr2, #0x10\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "add a_ptr3, a_ptr3, #0x10\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "cbz %[loops], 1f\n"
-                        "2:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "udot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "udot z24.s, z8.b, z2.b[0]\n"
-                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
-                        "udot z28.s, z8.b, z3.b[0]\n"
-                        "subs %[loops], %[loops], #0x1\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "add %[a_ptr0], %[a_ptr0], #0x20\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "add a_ptr1, a_ptr1, #0x20\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "add a_ptr2, a_ptr2, #0x20\n"
-                        "udot z29.s, z9.b, z3.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
-                        "add a_ptr3, a_ptr3, #0x20\n"
-                        "udot z26.s, z10.b, z2.b[0]\n"
-                        "udot z30.s, z10.b, z3.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z23.s, z11.b, z1.b[0]\n"
-                        "udot z27.s, z11.b, z2.b[0]\n"
-                        "udot z31.s, z11.b, z3.b[0]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
-                        "udot z24.s, z12.b, z2.b[1]\n"
-                        "udot z28.s, z12.b, z3.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "udot z21.s, z13.b, z1.b[1]\n"
-                        "udot z25.s, z13.b, z2.b[1]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z29.s, z13.b, z3.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z26.s, z14.b, z2.b[1]\n"
-                        "udot z30.s, z14.b, z3.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
-                        "udot z27.s, z15.b, z2.b[1]\n"
-                        "udot z31.s, z15.b, z3.b[1]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
-                        "udot z24.s, z8.b, z2.b[2]\n"
-                        "udot z28.s, z8.b, z3.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "udot z21.s, z9.b, z1.b[2]\n"
-                        "udot z25.s, z9.b, z2.b[2]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z29.s, z9.b, z3.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z26.s, z10.b, z2.b[2]\n"
-                        "udot z30.s, z10.b, z3.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
-                        "udot z27.s, z11.b, z2.b[2]\n"
-                        "udot z31.s, z11.b, z3.b[2]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z12.b, z1.b[3]\n"
-                        "udot z24.s, z12.b, z2.b[3]\n"
-                        "udot z28.s, z12.b, z3.b[3]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "udot z21.s, z13.b, z1.b[3]\n"
-                        "udot z25.s, z13.b, z2.b[3]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z29.s, z13.b, z3.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z14.b, z1.b[3]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z26.s, z14.b, z2.b[3]\n"
-                        "udot z30.s, z14.b, z3.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
-                        "udot z23.s, z15.b, z1.b[3]\n"
-                        "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
-                        "udot z27.s, z15.b, z2.b[3]\n"
-                        "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
-                        "udot z31.s, z15.b, z3.b[3]\n"
-                        "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z8.b, z5.b[0]\n"
-                        "udot z24.s, z8.b, z6.b[0]\n"
-                        "udot z28.s, z8.b, z7.b[0]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
-                        "udot z21.s, z9.b, z5.b[0]\n"
-                        "udot z25.s, z9.b, z6.b[0]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z29.s, z9.b, z7.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z10.b, z5.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z26.s, z10.b, z6.b[0]\n"
-                        "udot z30.s, z10.b, z7.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z23.s, z11.b, z5.b[0]\n"
-                        "udot z27.s, z11.b, z6.b[0]\n"
-                        "udot z31.s, z11.b, z7.b[0]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z12.b, z5.b[1]\n"
-                        "udot z24.s, z12.b, z6.b[1]\n"
-                        "udot z28.s, z12.b, z7.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
-                        "udot z21.s, z13.b, z5.b[1]\n"
-                        "udot z25.s, z13.b, z6.b[1]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z29.s, z13.b, z7.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z14.b, z5.b[1]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z26.s, z14.b, z6.b[1]\n"
-                        "udot z30.s, z14.b, z7.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z23.s, z15.b, z5.b[1]\n"
-                        "udot z27.s, z15.b, z6.b[1]\n"
-                        "udot z31.s, z15.b, z7.b[1]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z8.b, z5.b[2]\n"
-                        "udot z24.s, z8.b, z6.b[2]\n"
-                        "udot z28.s, z8.b, z7.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
-                        "udot z21.s, z9.b, z5.b[2]\n"
-                        "udot z25.s, z9.b, z6.b[2]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z29.s, z9.b, z7.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z10.b, z5.b[2]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z26.s, z10.b, z6.b[2]\n"
-                        "udot z30.s, z10.b, z7.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z23.s, z11.b, z5.b[2]\n"
-                        "udot z27.s, z11.b, z6.b[2]\n"
-                        "udot z31.s, z11.b, z7.b[2]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z12.b, z4.b[3]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z12.b, z5.b[3]\n"
-                        "udot z24.s, z12.b, z6.b[3]\n"
-                        "udot z28.s, z12.b, z7.b[3]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "udot z17.s, z13.b, z4.b[3]\n"
-                        "udot z21.s, z13.b, z5.b[3]\n"
-                        "udot z25.s, z13.b, z6.b[3]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z29.s, z13.b, z7.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z4.b[3]\n"
-                        "udot z22.s, z14.b, z5.b[3]\n"
-                        "udot z26.s, z14.b, z6.b[3]\n"
-                        "udot z30.s, z14.b, z7.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z4.b[3]\n"
-                        "udot z23.s, z15.b, z5.b[3]\n"
-                        "udot z27.s, z15.b, z6.b[3]\n"
-                        "udot z31.s, z15.b, z7.b[3]\n"
-                        "b.ne 2b\n"
-                        "1:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "cbz %[regs], 3f\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
-                        "udot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z5.b, p7/z, [a_ptr1]\n"
-                        "udot z24.s, z8.b, z2.b[0]\n"
-                        "ld1rqb z6.b, p7/z, [a_ptr2]\n"
-                        "udot z28.s, z8.b, z3.b[0]\n"
-                        "ld1rqb z7.b, p7/z, [a_ptr3]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "udot z29.s, z9.b, z3.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
-                        "udot z26.s, z10.b, z2.b[0]\n"
-                        "udot z30.s, z10.b, z3.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z23.s, z11.b, z1.b[0]\n"
-                        "udot z27.s, z11.b, z2.b[0]\n"
-                        "udot z31.s, z11.b, z3.b[0]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
-                        "udot z24.s, z12.b, z2.b[1]\n"
-                        "udot z28.s, z12.b, z3.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "udot z21.s, z13.b, z1.b[1]\n"
-                        "udot z25.s, z13.b, z2.b[1]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z29.s, z13.b, z3.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z26.s, z14.b, z2.b[1]\n"
-                        "udot z30.s, z14.b, z3.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
-                        "udot z27.s, z15.b, z2.b[1]\n"
-                        "udot z31.s, z15.b, z3.b[1]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
-                        "udot z24.s, z8.b, z2.b[2]\n"
-                        "udot z28.s, z8.b, z3.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "udot z21.s, z9.b, z1.b[2]\n"
-                        "udot z25.s, z9.b, z2.b[2]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z29.s, z9.b, z3.b[2]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z26.s, z10.b, z2.b[2]\n"
-                        "udot z30.s, z10.b, z3.b[2]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
-                        "udot z27.s, z11.b, z2.b[2]\n"
-                        "udot z31.s, z11.b, z3.b[2]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z12.b, z1.b[3]\n"
-                        "udot z24.s, z12.b, z2.b[3]\n"
-                        "udot z28.s, z12.b, z3.b[3]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "udot z21.s, z13.b, z1.b[3]\n"
-                        "udot z25.s, z13.b, z2.b[3]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z29.s, z13.b, z3.b[3]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z14.b, z1.b[3]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z26.s, z14.b, z2.b[3]\n"
-                        "udot z30.s, z14.b, z3.b[3]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
-                        "udot z23.s, z15.b, z1.b[3]\n"
-                        "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
-                        "udot z27.s, z15.b, z2.b[3]\n"
-                        "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
-                        "udot z31.s, z15.b, z3.b[3]\n"
-                        "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z8.b, z5.b[0]\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #2\n"
-                        "udot z24.s, z8.b, z6.b[0]\n"
-                        "addvl a_ptr1, a_ptr1, #2\n"
-                        "udot z28.s, z8.b, z7.b[0]\n"
-                        "addvl a_ptr2, a_ptr2, #2\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "addvl a_ptr3, a_ptr3, #2\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
-                        "udot z21.s, z9.b, z5.b[0]\n"
-                        "udot z25.s, z9.b, z6.b[0]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z29.s, z9.b, z7.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z22.s, z10.b, z5.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z26.s, z10.b, z6.b[0]\n"
-                        "udot z30.s, z10.b, z7.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z23.s, z11.b, z5.b[0]\n"
-                        "udot z27.s, z11.b, z6.b[0]\n"
-                        "udot z31.s, z11.b, z7.b[0]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z12.b, z5.b[1]\n"
-                        "udot z24.s, z12.b, z6.b[1]\n"
-                        "udot z28.s, z12.b, z7.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
-                        "udot z21.s, z13.b, z5.b[1]\n"
-                        "udot z25.s, z13.b, z6.b[1]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z29.s, z13.b, z7.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "udot z22.s, z14.b, z5.b[1]\n"
-                        "udot z26.s, z14.b, z6.b[1]\n"
-                        "udot z30.s, z14.b, z7.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "udot z23.s, z15.b, z5.b[1]\n"
-                        "udot z27.s, z15.b, z6.b[1]\n"
-                        "udot z31.s, z15.b, z7.b[1]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "udot z20.s, z8.b, z5.b[2]\n"
-                        "udot z24.s, z8.b, z6.b[2]\n"
-                        "udot z28.s, z8.b, z7.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
-                        "udot z21.s, z9.b, z5.b[2]\n"
-                        "udot z25.s, z9.b, z6.b[2]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z29.s, z9.b, z7.b[2]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "udot z22.s, z10.b, z5.b[2]\n"
-                        "udot z26.s, z10.b, z6.b[2]\n"
-                        "udot z30.s, z10.b, z7.b[2]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "udot z23.s, z11.b, z5.b[2]\n"
-                        "udot z27.s, z11.b, z6.b[2]\n"
-                        "udot z31.s, z11.b, z7.b[2]\n"
-                        "udot z16.s, z12.b, z4.b[3]\n"
-                        "udot z20.s, z12.b, z5.b[3]\n"
-                        "udot z24.s, z12.b, z6.b[3]\n"
-                        "udot z28.s, z12.b, z7.b[3]\n"
-                        "udot z17.s, z13.b, z4.b[3]\n"
-                        "udot z21.s, z13.b, z5.b[3]\n"
-                        "udot z25.s, z13.b, z6.b[3]\n"
-                        "udot z29.s, z13.b, z7.b[3]\n"
-                        "udot z18.s, z14.b, z4.b[3]\n"
-                        "udot z22.s, z14.b, z5.b[3]\n"
-                        "udot z26.s, z14.b, z6.b[3]\n"
-                        "udot z30.s, z14.b, z7.b[3]\n"
-                        "udot z19.s, z15.b, z4.b[3]\n"
-                        "udot z23.s, z15.b, z5.b[3]\n"
-                        "udot z27.s, z15.b, z6.b[3]\n"
-                        "udot z31.s, z15.b, z7.b[3]\n"
-                        "cbz %[blocks], 4f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "udot z20.s, z8.b, z1.b[0]\n"
-                        "udot z24.s, z8.b, z2.b[0]\n"
-                        "udot z28.s, z8.b, z3.b[0]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "udot z29.s, z9.b, z3.b[0]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
-                        "udot z26.s, z10.b, z2.b[0]\n"
-                        "udot z30.s, z10.b, z3.b[0]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "udot z23.s, z11.b, z1.b[0]\n"
-                        "udot z27.s, z11.b, z2.b[0]\n"
-                        "udot z31.s, z11.b, z3.b[0]\n"
-                        "b.eq 5f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
-                        "udot z24.s, z12.b, z2.b[1]\n"
-                        "udot z28.s, z12.b, z3.b[1]\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "udot z21.s, z13.b, z1.b[1]\n"
-                        "udot z25.s, z13.b, z2.b[1]\n"
-                        "udot z29.s, z13.b, z3.b[1]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "udot z26.s, z14.b, z2.b[1]\n"
-                        "udot z30.s, z14.b, z3.b[1]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
-                        "udot z27.s, z15.b, z2.b[1]\n"
-                        "udot z31.s, z15.b, z3.b[1]\n"
-                        "b.eq 6f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
-                        "udot z24.s, z8.b, z2.b[2]\n"
-                        "udot z28.s, z8.b, z3.b[2]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "udot z21.s, z9.b, z1.b[2]\n"
-                        "udot z25.s, z9.b, z2.b[2]\n"
-                        "udot z29.s, z9.b, z3.b[2]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "udot z26.s, z10.b, z2.b[2]\n"
-                        "udot z30.s, z10.b, z3.b[2]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
-                        "udot z27.s, z11.b, z2.b[2]\n"
-                        "udot z31.s, z11.b, z3.b[2]\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 8f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 9f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 10f\n"
-                        "9:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 10f\n"
-                        "8:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "10:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "udot z20.s, z12.b, z1.b[3]\n"
-                        "udot z24.s, z12.b, z2.b[3]\n"
-                        "udot z28.s, z12.b, z3.b[3]\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "udot z21.s, z13.b, z1.b[3]\n"
-                        "udot z25.s, z13.b, z2.b[3]\n"
-                        "udot z29.s, z13.b, z3.b[3]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "udot z22.s, z14.b, z1.b[3]\n"
-                        "udot z26.s, z14.b, z2.b[3]\n"
-                        "udot z30.s, z14.b, z3.b[3]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "udot z23.s, z15.b, z1.b[3]\n"
-                        "udot z27.s, z15.b, z2.b[3]\n"
-                        "udot z31.s, z15.b, z3.b[3]\n"
-                        "b 7f\n"
-                        "6:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 11f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 12f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 13f\n"
-                        "12:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 13f\n"
-                        "11:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "13:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
-                        "udot z24.s, z8.b, z2.b[2]\n"
-                        "udot z28.s, z8.b, z3.b[2]\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "udot z21.s, z9.b, z1.b[2]\n"
-                        "udot z25.s, z9.b, z2.b[2]\n"
-                        "udot z29.s, z9.b, z3.b[2]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "udot z26.s, z10.b, z2.b[2]\n"
-                        "udot z30.s, z10.b, z3.b[2]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
-                        "udot z27.s, z11.b, z2.b[2]\n"
-                        "udot z31.s, z11.b, z3.b[2]\n"
-                        "b 7f\n"
-                        "5:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 14f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 15f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 16f\n"
-                        "15:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 16f\n"
-                        "14:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "16:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
-                        "udot z24.s, z12.b, z2.b[1]\n"
-                        "udot z28.s, z12.b, z3.b[1]\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "udot z21.s, z13.b, z1.b[1]\n"
-                        "udot z25.s, z13.b, z2.b[1]\n"
-                        "udot z29.s, z13.b, z3.b[1]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "udot z26.s, z14.b, z2.b[1]\n"
-                        "udot z30.s, z14.b, z3.b[1]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
-                        "udot z27.s, z15.b, z2.b[1]\n"
-                        "udot z31.s, z15.b, z3.b[1]\n"
-                        "b 7f\n"
-                        "4:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 17f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 18f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 19f\n"
-                        "18:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 19f\n"
-                        "17:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "19:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "udot z20.s, z8.b, z1.b[0]\n"
-                        "udot z24.s, z8.b, z2.b[0]\n"
-                        "udot z28.s, z8.b, z3.b[0]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "udot z29.s, z9.b, z3.b[0]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
-                        "udot z26.s, z10.b, z2.b[0]\n"
-                        "udot z30.s, z10.b, z3.b[0]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "udot z23.s, z11.b, z1.b[0]\n"
-                        "udot z27.s, z11.b, z2.b[0]\n"
-                        "udot z31.s, z11.b, z3.b[0]\n"
-                        "b 7f\n"
-                        "3:\n"
-                        "udot z16.s, z8.b, z0.b[0]\n"
-                        "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
-                        "udot z20.s, z8.b, z1.b[0]\n"
-                        "ld1rqb z5.b, p6/z, [a_ptr1]\n"
-                        "udot z24.s, z8.b, z2.b[0]\n"
-                        "ld1rqb z6.b, p6/z, [a_ptr2]\n"
-                        "udot z28.s, z8.b, z3.b[0]\n"
-                        "ld1rqb z7.b, p6/z, [a_ptr3]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z17.s, z9.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "addvl %[a_ptr0], %[a_ptr0], #1\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "addvl a_ptr1, a_ptr1, #1\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z21.s, z9.b, z1.b[0]\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "udot z25.s, z9.b, z2.b[0]\n"
-                        "addvl a_ptr2, a_ptr2, #1\n"
-                        "udot z29.s, z9.b, z3.b[0]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z10.b, z0.b[0]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "udot z22.s, z10.b, z1.b[0]\n"
-                        "addvl a_ptr3, a_ptr3, #1\n"
-                        "udot z26.s, z10.b, z2.b[0]\n"
-                        "udot z30.s, z10.b, z3.b[0]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z11.b, z0.b[0]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "udot z23.s, z11.b, z1.b[0]\n"
-                        "udot z27.s, z11.b, z2.b[0]\n"
-                        "udot z31.s, z11.b, z3.b[0]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z12.b, z0.b[1]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "udot z20.s, z12.b, z1.b[1]\n"
-                        "udot z24.s, z12.b, z2.b[1]\n"
-                        "udot z28.s, z12.b, z3.b[1]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "udot z17.s, z13.b, z0.b[1]\n"
-                        "udot z21.s, z13.b, z1.b[1]\n"
-                        "udot z25.s, z13.b, z2.b[1]\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "udot z29.s, z13.b, z3.b[1]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "udot z18.s, z14.b, z0.b[1]\n"
-                        "udot z22.s, z14.b, z1.b[1]\n"
-                        "udot z26.s, z14.b, z2.b[1]\n"
-                        "udot z30.s, z14.b, z3.b[1]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "udot z19.s, z15.b, z0.b[1]\n"
-                        "udot z23.s, z15.b, z1.b[1]\n"
-                        "udot z27.s, z15.b, z2.b[1]\n"
-                        "udot z31.s, z15.b, z3.b[1]\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "udot z16.s, z8.b, z0.b[2]\n"
-                        "udot z20.s, z8.b, z1.b[2]\n"
-                        "udot z24.s, z8.b, z2.b[2]\n"
-                        "udot z28.s, z8.b, z3.b[2]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "udot z17.s, z9.b, z0.b[2]\n"
-                        "udot z21.s, z9.b, z1.b[2]\n"
-                        "udot z25.s, z9.b, z2.b[2]\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z29.s, z9.b, z3.b[2]\n"
-                        "udot z18.s, z10.b, z0.b[2]\n"
-                        "udot z22.s, z10.b, z1.b[2]\n"
-                        "udot z26.s, z10.b, z2.b[2]\n"
-                        "udot z30.s, z10.b, z3.b[2]\n"
-                        "udot z19.s, z11.b, z0.b[2]\n"
-                        "udot z23.s, z11.b, z1.b[2]\n"
-                        "udot z27.s, z11.b, z2.b[2]\n"
-                        "udot z31.s, z11.b, z3.b[2]\n"
-                        "udot z16.s, z12.b, z0.b[3]\n"
-                        "udot z20.s, z12.b, z1.b[3]\n"
-                        "udot z24.s, z12.b, z2.b[3]\n"
-                        "udot z28.s, z12.b, z3.b[3]\n"
-                        "udot z17.s, z13.b, z0.b[3]\n"
-                        "udot z21.s, z13.b, z1.b[3]\n"
-                        "udot z25.s, z13.b, z2.b[3]\n"
-                        "udot z29.s, z13.b, z3.b[3]\n"
-                        "udot z18.s, z14.b, z0.b[3]\n"
-                        "udot z22.s, z14.b, z1.b[3]\n"
-                        "udot z26.s, z14.b, z2.b[3]\n"
-                        "udot z30.s, z14.b, z3.b[3]\n"
-                        "udot z19.s, z15.b, z0.b[3]\n"
-                        "udot z23.s, z15.b, z1.b[3]\n"
-                        "udot z27.s, z15.b, z2.b[3]\n"
-                        "udot z31.s, z15.b, z3.b[3]\n"
-                        "cbz %[blocks], 20f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "udot z20.s, z8.b, z5.b[0]\n"
-                        "udot z24.s, z8.b, z6.b[0]\n"
-                        "udot z28.s, z8.b, z7.b[0]\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
-                        "udot z21.s, z9.b, z5.b[0]\n"
-                        "udot z25.s, z9.b, z6.b[0]\n"
-                        "udot z29.s, z9.b, z7.b[0]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "udot z22.s, z10.b, z5.b[0]\n"
-                        "udot z26.s, z10.b, z6.b[0]\n"
-                        "udot z30.s, z10.b, z7.b[0]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "udot z23.s, z11.b, z5.b[0]\n"
-                        "udot z27.s, z11.b, z6.b[0]\n"
-                        "udot z31.s, z11.b, z7.b[0]\n"
-                        "b.eq 21f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "subs %[blocks], %[blocks], #0x1\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "udot z20.s, z12.b, z5.b[1]\n"
-                        "udot z24.s, z12.b, z6.b[1]\n"
-                        "udot z28.s, z12.b, z7.b[1]\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
-                        "udot z21.s, z13.b, z5.b[1]\n"
-                        "udot z25.s, z13.b, z6.b[1]\n"
-                        "udot z29.s, z13.b, z7.b[1]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "udot z22.s, z14.b, z5.b[1]\n"
-                        "udot z26.s, z14.b, z6.b[1]\n"
-                        "udot z30.s, z14.b, z7.b[1]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "udot z23.s, z15.b, z5.b[1]\n"
-                        "udot z27.s, z15.b, z6.b[1]\n"
-                        "udot z31.s, z15.b, z7.b[1]\n"
-                        "b.eq 22f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "add %[b_ptr3], %[b_ptr3], %[ldb]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr3]]\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "udot z20.s, z8.b, z5.b[2]\n"
-                        "udot z24.s, z8.b, z6.b[2]\n"
-                        "udot z28.s, z8.b, z7.b[2]\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
-                        "udot z21.s, z9.b, z5.b[2]\n"
-                        "udot z25.s, z9.b, z6.b[2]\n"
-                        "udot z29.s, z9.b, z7.b[2]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "udot z22.s, z10.b, z5.b[2]\n"
-                        "udot z26.s, z10.b, z6.b[2]\n"
-                        "udot z30.s, z10.b, z7.b[2]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "udot z23.s, z11.b, z5.b[2]\n"
-                        "udot z27.s, z11.b, z6.b[2]\n"
-                        "udot z31.s, z11.b, z7.b[2]\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 23f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 24f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 25f\n"
-                        "24:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 25f\n"
-                        "23:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "25:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z16.s, z12.b, z4.b[3]\n"
-                        "udot z20.s, z12.b, z5.b[3]\n"
-                        "udot z24.s, z12.b, z6.b[3]\n"
-                        "udot z28.s, z12.b, z7.b[3]\n"
-                        "udot z17.s, z13.b, z4.b[3]\n"
-                        "udot z21.s, z13.b, z5.b[3]\n"
-                        "udot z25.s, z13.b, z6.b[3]\n"
-                        "udot z29.s, z13.b, z7.b[3]\n"
-                        "udot z18.s, z14.b, z4.b[3]\n"
-                        "udot z22.s, z14.b, z5.b[3]\n"
-                        "udot z26.s, z14.b, z6.b[3]\n"
-                        "udot z30.s, z14.b, z7.b[3]\n"
-                        "udot z19.s, z15.b, z4.b[3]\n"
-                        "udot z23.s, z15.b, z5.b[3]\n"
-                        "udot z27.s, z15.b, z6.b[3]\n"
-                        "udot z31.s, z15.b, z7.b[3]\n"
-                        "b 7f\n"
-                        "22:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 26f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 27f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 28f\n"
-                        "27:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 28f\n"
-                        "26:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "28:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z4.b[2]\n"
-                        "udot z20.s, z8.b, z5.b[2]\n"
-                        "udot z24.s, z8.b, z6.b[2]\n"
-                        "udot z28.s, z8.b, z7.b[2]\n"
-                        "udot z17.s, z9.b, z4.b[2]\n"
-                        "udot z21.s, z9.b, z5.b[2]\n"
-                        "udot z25.s, z9.b, z6.b[2]\n"
-                        "udot z29.s, z9.b, z7.b[2]\n"
-                        "udot z18.s, z10.b, z4.b[2]\n"
-                        "udot z22.s, z10.b, z5.b[2]\n"
-                        "udot z26.s, z10.b, z6.b[2]\n"
-                        "udot z30.s, z10.b, z7.b[2]\n"
-                        "udot z19.s, z11.b, z4.b[2]\n"
-                        "udot z23.s, z11.b, z5.b[2]\n"
-                        "udot z27.s, z11.b, z6.b[2]\n"
-                        "udot z31.s, z11.b, z7.b[2]\n"
-                        "b 7f\n"
-                        "21:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 29f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 30f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z13.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 31f\n"
-                        "30:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z14.b, p4/z, [%[b_ptr1]]\n"
-                        "b 31f\n"
-                        "29:\n"
-                        "mov z13.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z14.b, #0\n"
-                        "ld1b z12.b, p4/z, [%[b_ptr0]]\n"
-                        "31:\n"
-                        "zip2 z15.b, z12.b, z13.b\n"
-                        "zip1 z13.b, z12.b, z13.b\n"
-                        "mov z12.b, #0\n"
-                        "zip2 z8.b, z14.b, z12.b\n"
-                        "zip1 z14.b, z14.b, z12.b\n"
-                        "zip1 z12.b, z13.b, z14.b\n"
-                        "zip2 z13.b, z13.b, z14.b\n"
-                        "zip1 z14.b, z15.b, z8.b\n"
-                        "zip2 z15.b, z15.b, z8.b\n"
-                        "udot z16.s, z12.b, z4.b[1]\n"
-                        "udot z20.s, z12.b, z5.b[1]\n"
-                        "udot z24.s, z12.b, z6.b[1]\n"
-                        "udot z28.s, z12.b, z7.b[1]\n"
-                        "udot z17.s, z13.b, z4.b[1]\n"
-                        "udot z21.s, z13.b, z5.b[1]\n"
-                        "udot z25.s, z13.b, z6.b[1]\n"
-                        "udot z29.s, z13.b, z7.b[1]\n"
-                        "udot z18.s, z14.b, z4.b[1]\n"
-                        "udot z22.s, z14.b, z5.b[1]\n"
-                        "udot z26.s, z14.b, z6.b[1]\n"
-                        "udot z30.s, z14.b, z7.b[1]\n"
-                        "udot z19.s, z15.b, z4.b[1]\n"
-                        "udot z23.s, z15.b, z5.b[1]\n"
-                        "udot z27.s, z15.b, z6.b[1]\n"
-                        "udot z31.s, z15.b, z7.b[1]\n"
-                        "b 7f\n"
-                        "20:\n"
-                        "cbz %[odds], 7f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 32f\n"
-                        "subs %[odds], %[odds], #0x1\n"
-                        "b.eq 33f\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr2], %[b_ptr2], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z9.b, p4/z, [%[b_ptr2]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 34f\n"
-                        "33:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "add %[b_ptr1], %[b_ptr1], %[ldb]\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "ld1b z10.b, p4/z, [%[b_ptr1]]\n"
-                        "b 34f\n"
-                        "32:\n"
-                        "mov z9.b, #0\n"
-                        "add %[b_ptr0], %[b_ptr0], %[ldb]\n"
-                        "mov z10.b, #0\n"
-                        "ld1b z8.b, p4/z, [%[b_ptr0]]\n"
-                        "34:\n"
-                        "zip2 z11.b, z8.b, z9.b\n"
-                        "zip1 z9.b, z8.b, z9.b\n"
-                        "mov z8.b, #0\n"
-                        "zip2 z12.b, z10.b, z8.b\n"
-                        "zip1 z10.b, z10.b, z8.b\n"
-                        "zip1 z8.b, z9.b, z10.b\n"
-                        "zip2 z9.b, z9.b, z10.b\n"
-                        "zip1 z10.b, z11.b, z12.b\n"
-                        "zip2 z11.b, z11.b, z12.b\n"
-                        "udot z16.s, z8.b, z4.b[0]\n"
-                        "udot z20.s, z8.b, z5.b[0]\n"
-                        "udot z24.s, z8.b, z6.b[0]\n"
-                        "udot z28.s, z8.b, z7.b[0]\n"
-                        "udot z17.s, z9.b, z4.b[0]\n"
-                        "udot z21.s, z9.b, z5.b[0]\n"
-                        "udot z25.s, z9.b, z6.b[0]\n"
-                        "udot z29.s, z9.b, z7.b[0]\n"
-                        "udot z18.s, z10.b, z4.b[0]\n"
-                        "udot z22.s, z10.b, z5.b[0]\n"
-                        "udot z26.s, z10.b, z6.b[0]\n"
-                        "udot z30.s, z10.b, z7.b[0]\n"
-                        "udot z19.s, z11.b, z4.b[0]\n"
-                        "udot z23.s, z11.b, z5.b[0]\n"
-                        "udot z27.s, z11.b, z6.b[0]\n"
-                        "udot z31.s, z11.b, z7.b[0]\n"
-                        "7:\n"
-                        "st1w z16.s, p0, [%[c_ptr0]]\n"
-                        "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
-                        "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
-                        "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
-                        "addvl %[c_ptr0], %[c_ptr0], #4\n"
-                        "st1w z20.s, p0, [c_ptr1]\n"
-                        "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
-                        "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
-                        "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
-                        "st1w z24.s, p0, [c_ptr2]\n"
-                        "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
-                        "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
-                        "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
-                        "st1w z28.s, p0, [c_ptr3]\n"
-                        "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
-                        "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
-                        "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
-                        ".unreq a_ptr1\n"
-                        ".unreq a_ptr2\n"
-                        ".unreq a_ptr3\n"
-                        ".unreq c_ptr1\n"
-                        ".unreq c_ptr2\n"
-                        ".unreq c_ptr3\n"
-                        : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [b_ptr1] "+r" (b_ptr1), [b_ptr2] "+r" (b_ptr2), [b_ptr3] "+r" (b_ptr3), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks), [odds] "+r" (odds)
-                        : [width] "r" (width), [append] "r" (static_cast<uint64_t>(append)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers), [ldb] "r" (ldbb)
-                        : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
-                    );
-                    break;
-            }
-
-        }
-    }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp
index 191262b..b555066 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,7 +57,7 @@
         return 1;
     }
 
-    static constexpr bool supports_append()
+    static constexpr bool supports_accumulate()
     {
         return false;
     }
@@ -77,7 +77,9 @@
     // Default to the generic kernel
     kern_type kernel=sve_smallK_hybrid_fp32_mla_1VLx8;
 
-    smallK_hybrid_fp32_mla_1VLx8(const CPUInfo *ci) { UNUSED(ci); }
+    smallK_hybrid_fp32_mla_1VLx8(const CPUInfo *)
+    {
+
     }
 };
 
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8/generic.cpp
index 3ad609b..5501688 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp
index fc18cbd..eef1e4c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,7 +57,7 @@
         return 4;
     }
 
-    static constexpr bool supports_append()
+    static constexpr bool supports_accumulate()
     {
         return false;
     }
@@ -77,7 +77,10 @@
     // Default to the generic kernel
     kern_type kernel=sve_smallK_hybrid_s8s32_dot_1VLx8;
 
-    smallK_hybrid_s8s32_dot_1VLx8(const CPUInfo *ci) { UNUSED(ci); }
+    smallK_hybrid_s8s32_dot_1VLx8(const CPUInfo *)
+    {
+
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8/generic.cpp
index d9813a5..e2fbdcb 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp
index 51d3e73..70a0b12 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,7 +57,7 @@
         return 4;
     }
 
-    static constexpr bool supports_append()
+    static constexpr bool supports_accumulate()
     {
         return false;
     }
@@ -77,7 +77,10 @@
     // Default to the generic kernel
     kern_type kernel=sve_smallK_hybrid_u8u32_dot_1VLx8;
 
-    smallK_hybrid_u8u32_dot_1VLx8(const CPUInfo *ci) { UNUSED(ci); }
+    smallK_hybrid_u8u32_dot_1VLx8(const CPUInfo *)
+    {
+
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8/generic.cpp
index 6eed457..1d0b84e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8/generic.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/mergeresults.cpp b/src/core/NEON/kernels/arm_gemm/mergeresults.cpp
index 83d6bcc..563c31d 100644
--- a/src/core/NEON/kernels/arm_gemm/mergeresults.cpp
+++ b/src/core/NEON/kernels/arm_gemm/mergeresults.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/mergeresults.hpp b/src/core/NEON/kernels/arm_gemm/mergeresults.hpp
index 1aedd23..589d87e 100644
--- a/src/core/NEON/kernels/arm_gemm/mergeresults.hpp
+++ b/src/core/NEON/kernels/arm_gemm/mergeresults.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp b/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp
index 16bdbb5..bea455c 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/a32_merge_float_8x6.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp
index 5d8eae4..a81d450 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp16_24x8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -1130,11 +1130,7 @@
             }
             else
             {
-                const __fp16 *biasptr = nullbias;
-                if (bias)
-                {
-                    biasptr = bias + i;
-                }
+                const __fp16 *biasptr = bias ? bias + i : nullbias;
 
                 switch(height)
                 {
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_12x8.hpp
index 088353e..284f2dc 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_fp32_12x8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,8 +30,8 @@
 {
     const float *inptr = in;
     float nullbias[12];
-    float minval = - std::numeric_limits<float>::infinity();
-    float maxval =   std::numeric_limits<float>::infinity();
+    float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+    float maxval =   static_cast<float>(std::numeric_limits<float>::infinity());
 
     switch(act.type)
     {
@@ -1106,11 +1106,7 @@
             }
             else
             {
-                const float *biasptr = nullbias;
-                if (bias)
-                {
-                    biasptr = bias + i;
-                }
+                const float *biasptr = bias ? bias + i : nullbias;
 
                 switch(height)
                 {
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_s32_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_s32_12x8.hpp
index 2e45d8b..fcf08e4 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_s32_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_s32_12x8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,10 +26,8 @@
 #ifdef __aarch64__
 
 template<>
-void MergeResults<12, 8, false>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t *bias, Activation act, bool append)
+void MergeResults<12, 8, false>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t *bias, Activation , bool append)
 {
-    UNUSED(act);
-
     const int32_t *inptr = in;
     int32_t nullbias[12];
 
@@ -862,11 +860,7 @@
             }
             else
             {
-                const int32_t *biasptr = nullbias;
-                if (bias)
-                {
-                    biasptr = bias + i;
-                }
+                const int32_t *biasptr = bias ? bias + i : nullbias;
 
                 switch(height)
                 {
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_s32_4x4.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_s32_4x4.hpp
index 6d869af..88eaa5f 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_s32_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_s32_4x4.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,10 +26,8 @@
 #ifdef __aarch64__
 
 template<>
-void MergeResults<4, 4, false>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t *bias, Activation act, bool append)
+void MergeResults<4, 4, false>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t *bias, Activation , bool append)
 {
-    UNUSED(act);
-
     const int32_t *inptr = in;
     int32_t nullbias[4];
 
@@ -240,11 +238,7 @@
             }
             else
             {
-                const int32_t *biasptr = nullbias;
-                if (bias)
-                {
-                    biasptr = bias + i;
-                }
+                const int32_t *biasptr = bias ? bias + i : nullbias;
 
                 switch(height)
                 {
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_u32_12x8.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_u32_12x8.hpp
index 0a05944..adc02f1 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_u32_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_u32_12x8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,10 +26,8 @@
 #ifdef __aarch64__
 
 template<>
-void MergeResults<12, 8, false>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t *bias, Activation act, bool append)
+void MergeResults<12, 8, false>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t *bias, Activation , bool append)
 {
-    UNUSED(act);
-
     const uint32_t *inptr = in;
     uint32_t nullbias[12];
 
@@ -862,11 +860,7 @@
             }
             else
             {
-                const uint32_t *biasptr = nullbias;
-                if (bias)
-                {
-                    biasptr = bias + i;
-                }
+                const uint32_t *biasptr = bias ? bias + i : nullbias;
 
                 switch(height)
                 {
diff --git a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_u32_4x4.hpp b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_u32_4x4.hpp
index efb17dc..32e1eeb 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/a64_merge_u32_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/a64_merge_u32_4x4.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,10 +26,8 @@
 #ifdef __aarch64__
 
 template<>
-void MergeResults<4, 4, false>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t *bias, Activation act, bool append)
+void MergeResults<4, 4, false>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t *bias, Activation , bool append)
 {
-    UNUSED(act);
-
     const uint32_t *inptr = in;
     uint32_t nullbias[4];
 
@@ -240,11 +238,7 @@
             }
             else
             {
-                const uint32_t *biasptr = nullbias;
-                if (bias)
-                {
-                    biasptr = bias + i;
-                }
+                const uint32_t *biasptr = bias ? bias + i : nullbias;
 
                 switch(height)
                 {
diff --git a/src/core/NEON/kernels/arm_gemm/merges/list.hpp b/src/core/NEON/kernels/arm_gemm/merges/list.hpp
index 4edb497..825c2fd 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/list.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/list.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp16_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp16_3VLx8.hpp
index a44ef55..cf1d103 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp16_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp16_3VLx8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -1010,11 +1010,7 @@
             }
             else
             {
-                const __fp16 *biasptr = nullbias;
-                if (bias)
-                {
-                    biasptr = bias + i;
-                }
+                const __fp16 *biasptr = bias ? bias + i : nullbias;
 
                 switch(height)
                 {
diff --git a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp
index bb073e4..b0d10c0 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_fp32_3VLx8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -1010,11 +1010,7 @@
             }
             else
             {
-                const float *biasptr = nullbias;
-                if (bias)
-                {
-                    biasptr = bias + i;
-                }
+                const float *biasptr = bias ? bias + i : nullbias;
 
                 switch(height)
                 {
diff --git a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp
index d4c5073..34b6fe3 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_s32_3VLx8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,13 +26,12 @@
 #ifdef __ARM_FEATURE_SVE
 
 template<>
-void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t *bias, Activation act, bool append)
+void MergeResults<3, 8, true>(int32_t *out, const int32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const int32_t *bias, Activation , bool append)
 {
-    UNUSED(act);
-
     const int32_t *inptr = in;
     int32_t nullbias[192];
 
+
     if (!append && !bias)
     {
         memset(nullbias, 0, (3 * get_vector_length<int32_t>() * sizeof(int32_t)));
@@ -765,11 +764,7 @@
             }
             else
             {
-                const int32_t *biasptr = nullbias;
-                if (bias)
-                {
-                    biasptr = bias + i;
-                }
+                const int32_t *biasptr = bias ? bias + i : nullbias;
 
                 switch(height)
                 {
diff --git a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_u32_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_u32_3VLx8.hpp
index f2a28fa..c4b2bb5 100644
--- a/src/core/NEON/kernels/arm_gemm/merges/sve_merge_u32_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/merges/sve_merge_u32_3VLx8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,13 +26,12 @@
 #ifdef __ARM_FEATURE_SVE
 
 template<>
-void MergeResults<3, 8, true>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t *bias, Activation act, bool append)
+void MergeResults<3, 8, true>(uint32_t *out, const uint32_t *in, const int ldout, const int y0, const int ymax, const int x0, const int xmax, const uint32_t *bias, Activation , bool append)
 {
-    UNUSED(act);
-
     const uint32_t *inptr = in;
     uint32_t nullbias[192];
 
+
     if (!append && !bias)
     {
         memset(nullbias, 0, (3 * get_vector_length<uint32_t>() * sizeof(uint32_t)));
@@ -765,11 +764,7 @@
             }
             else
             {
-                const uint32_t *biasptr = nullbias;
-                if (bias)
-                {
-                    biasptr = bias + i;
-                }
+                const uint32_t *biasptr = bias ? bias + i : nullbias;
 
                 switch(height)
                 {
diff --git a/src/core/NEON/kernels/arm_gemm/misc.cpp b/src/core/NEON/kernels/arm_gemm/misc.cpp
index 6758a88..229e6b5 100644
--- a/src/core/NEON/kernels/arm_gemm/misc.cpp
+++ b/src/core/NEON/kernels/arm_gemm/misc.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,6 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+
 #ifndef NO_MULTI_THREADING
 #include <mutex>
 #endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp b/src/core/NEON/kernels/arm_gemm/performance_parameters.hpp
similarity index 63%
copy from src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
copy to src/core/NEON/kernels/arm_gemm/performance_parameters.hpp
index 36f84d8..059ab5f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
+++ b/src/core/NEON/kernels/arm_gemm/performance_parameters.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,35 +23,15 @@
  */
 #pragma once
 
-#ifdef __aarch64__
-
 namespace arm_gemm {
 
-// Actual kernel implementations
-void a64_sgemv_trans(const float *, const float *, float *, float, int, int, int);
+struct PerformanceParameters {
+    float	kernel_macs_cycle;
+    float	prepare_bytes_cycle = 0.0f;
+    float	merge_bytes_cycle   = 0.0f;
 
-// Transposed SGEMV strategy class.
-class sgemv_trans {
-public:
-    typedef float operand_type;
-    typedef float result_type;
-
-    typedef void (*kern_type)(const float *, const float *, float *, float, int, int, int);
-
-    /* Kernel blocking parameters */
-    static unsigned int out_width() {
-        return 96;
-    }
-
-    static unsigned int k_unroll() {
-        return 1;
-    }
-
-    kern_type kernel=a64_sgemv_trans;
-
-    sgemv_trans(const CPUInfo *ci) { UNUSED(ci); }
+    PerformanceParameters(float k) : kernel_macs_cycle(k) { }
+    PerformanceParameters(float k, float p, float m) : kernel_macs_cycle(k), prepare_bytes_cycle(p), merge_bytes_cycle(m) { }
 };
 
 } // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
index 18f030f..eec842d 100644
--- a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
+++ b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,16 +61,9 @@
     }
 
     /* Local working space: We need space for the subgemm output (above) and
-     * the row sums.  If the GEMM is not pretransposed we need to store the
-     * column sums here too.  */
+     * the row sums.  */
     size_t local_working_size() const {
-        size_t sz = subgemm_output_size() + row_sum_size();
-
-        if (_args._pretransposed_hint) {
-            return sz;
-        }
-
-        return sz + col_sum_size();
+        return subgemm_output_size() + row_sum_size();
     }
 
     void set_child_arrays() {
@@ -90,15 +83,6 @@
         }
     }
 
-    void col_sums_runtime(unsigned int threadid) {
-        unsigned int first_col = (threadid * _args._Nsize) / _args._maxthreads;
-        unsigned int last_col = ((threadid + 1) * _args._Nsize) / _args._maxthreads;
-
-        for (unsigned int multi=0; multi<_args._nmulti; multi++) {
-            compute_col_sums(_params, (last_col - first_col), _args._Ksize, this->_Bptr + (multi * this->_B_multi_stride) + first_col, this->_ldb, _col_sums + (multi * _args._Nsize) + first_col, _args._Ksize, multi, first_col);
-        }
-    }
-
     void requantize_runtime(unsigned int threadid) {
         unsigned int first_row = (threadid * _args._Msize) / _args._maxthreads;
         unsigned int last_row = ((threadid+1) * _args._Msize) / _args._maxthreads;
@@ -115,7 +99,7 @@
                                     _args._Nsize,
                                     this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (first_row * this->_ldc), this->_ldc,
                                     _row_sums + (multi * _args._nbatches * _args._Msize) + (batch * _args._Msize) + first_row,
-                                    _col_sums + (multi * _args._Nsize));
+                                    _col_sums + (multi * _args._Nsize), 0);
             }
         }
     }
@@ -126,16 +110,12 @@
     QuantizeWrapper operator=(const QuantizeWrapper &) = delete;
 
     QuantizeWrapper(const GemmArgs &args, const Requantize32 &qp) : _params(qp), _args(args), _barrier(args._maxthreads) {
-        GemmArgs newargs = GemmArgs(args._ci, args._Msize, args._Nsize, args._Ksize, args._nbatches, args._nmulti, args._trA, args._trB, Activation(), args._maxthreads, args._pretransposed_hint, nullptr);
+        GemmArgs newargs = GemmArgs(args._ci, args._Msize, args._Nsize, args._Ksize, args._nbatches, args._nmulti, Activation(), args._maxthreads, nullptr);
         _subgemm = gemm<To, Tgemm>(newargs);
 
         if (_subgemm == nullptr) {
             return;
         }
-
-        if (!_subgemm->B_is_pretransposed()) {
-            _args._pretransposed_hint = false;
-        }
     }
 
     void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride,
@@ -149,7 +129,7 @@
     }
 
     ndrange_t get_window_size() const override {
-        return _subgemm->get_window_size();
+        return { _subgemm->get_window_size() };
     }
 
     void set_nthreads(int nthreads) override {
@@ -158,12 +138,8 @@
         _args._maxthreads = nthreads;
     }
 
-    // Execute
-    void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
+    void execute(const ndcoord_t &work_range, const ndcoord_t &thread_locator, int threadid) override {
         _subgemm->execute(work_range, thread_locator, threadid);
-        if (!_args._pretransposed_hint) {
-            col_sums_runtime(threadid);
-        }
 
         _barrier.arrive_and_wait();
 
@@ -178,7 +154,7 @@
 
     // ptr
     // V
-    // | subgemm output | row_sums | col_sums (if not pretransposed | subgemm working space |
+    // | subgemm output | row_sums | subgemm working space |
     void set_working_space(void *space) override {
         uintptr_t space_int = reinterpret_cast<uintptr_t>(space);
 
@@ -186,16 +162,13 @@
         _subgemm->set_working_space(reinterpret_cast<void *>(space_int + local_working_size()));
 
         _row_sums = reinterpret_cast<int32_t *>(space_int + subgemm_output_size());
-        if (!_args._pretransposed_hint) {
-            _col_sums = reinterpret_cast<int32_t *>(space_int + subgemm_output_size() + row_sum_size());
-        }
 
         set_child_arrays();
     }
 
     bool B_is_pretransposed() const override {
         /* We clear this flag if the subgemm isn't pretransposed, so just return its value */
-        return _args._pretransposed_hint;
+        return _subgemm->B_is_pretransposed();
     }
 
     bool B_pretranspose_required() const override {
@@ -203,18 +176,10 @@
     }
 
     size_t get_B_pretransposed_array_size() const override {
-        if (_args._pretransposed_hint) {
-            return _subgemm->get_B_pretransposed_array_size() + col_sum_size();
-        }
-
-        return 0;
+        return _subgemm->get_B_pretransposed_array_size() + col_sum_size();
     }
 
     void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override {
-        if (!_args._pretransposed_hint) {
-            return;
-        }
-
         uintptr_t buffer_int = reinterpret_cast<uintptr_t>(buffer);
         _subgemm->pretranspose_B_array(reinterpret_cast<void *>(buffer_int + col_sum_size()), B, ldb, B_multi_stride);
 
@@ -224,10 +189,6 @@
     }
 
     void set_pretransposed_B_data(void *buffer) override {
-        if (!_args._pretransposed_hint) {
-            return;
-        }
-
         uintptr_t buffer_int = reinterpret_cast<uintptr_t>(buffer);
         _subgemm->set_pretransposed_B_data(reinterpret_cast<void *>(buffer_int + col_sum_size()));
         _col_sums = reinterpret_cast<int32_t *>(buffer);
diff --git a/src/core/NEON/kernels/arm_gemm/quantized.cpp b/src/core/NEON/kernels/arm_gemm/quantized.cpp
index 00b42cf..e50dca7 100644
--- a/src/core/NEON/kernels/arm_gemm/quantized.cpp
+++ b/src/core/NEON/kernels/arm_gemm/quantized.cpp
@@ -24,6 +24,7 @@
 #ifdef __aarch64__
 
 #include "arm_gemm.hpp"
+#include "utils.hpp"
 
 #include <arm_neon.h>
 
@@ -57,7 +58,7 @@
 template<bool do_shift_correction, bool per_channel>
 void requantize_block_32_int(const Requantize32 &qp, unsigned int width, unsigned int height,
                              const int32_t *input, unsigned int in_stride, int8_t *output, unsigned int out_stride,
-                             const int32_t *row_bias, const int32_t *col_bias) {
+                             const int32_t *row_bias, const int32_t *col_bias, const unsigned int start_col) {
     const int32x4_t v_mul      = vdupq_n_s32(qp.per_layer_mul);
     const int32x4_t v_shift    = vdupq_n_s32(qp.per_layer_shift);
     const int32x4_t v_minval   = vdupq_n_s32(qp.minval);
@@ -76,8 +77,8 @@
         unsigned int odds=(width % 4);
 
         const int32_t *colptr = col_bias;
-        const int32_t *perch_mul_ptr   = qp.per_channel_muls;
-        const int32_t *perch_shift_ptr = qp.per_channel_shifts;
+        const int32_t *perch_mul_ptr   = qp.per_channel_muls + start_col;
+        const int32_t *perch_shift_ptr = qp.per_channel_shifts + start_col;
 
         const int32_t *in_ptr = input + (row * in_stride);
         int8_t *out_ptr = output + (row * out_stride);
@@ -283,7 +284,6 @@
                 v_mul0=v_mul;
                 v_shf0=v_shift;
             }
-
             // Load column pointers
             int32x4_t v_col0 = vld1q_s32(colptr);
             colptr += 4;
@@ -461,33 +461,33 @@
 template<typename Tin, typename Tout>
 void requantize_block_32(const Requantize32 &qp, unsigned int width, unsigned int height,
                          const Tin *input, unsigned int in_stride, Tout *output, unsigned int out_stride,
-                         const int32_t *row_bias, const int32_t *col_bias) {
+                         const int32_t *row_bias, const int32_t *col_bias, unsigned int start_col) {
     if (qp.per_channel_requant) {
         if (qp.minval >= qp.c_offset) {
             requantize_block_32_int<false, true>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
-                             reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias);
+                             reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
         } else {
             requantize_block_32_int<true, true>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
-                             reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias);
+                             reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
         }
     } else {
         if (qp.minval >= qp.c_offset) {
             requantize_block_32_int<false, false>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
-                             reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias);
+                             reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
         } else {
             requantize_block_32_int<true, false>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
-                             reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias);
+                             reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
         }
     }
 }
 
 template void requantize_block_32(const Requantize32 &qp, unsigned int width, unsigned int height,
                          const int32_t *input, unsigned int in_stride, int8_t *output, unsigned int out_stride,
-                         const int32_t *row_bias, const int32_t *col_bias);
+                         const int32_t *row_bias, const int32_t *col_bias, unsigned int start_col);
 
 template void requantize_block_32(const Requantize32 &qp, unsigned int width, unsigned int height,
                          const uint32_t *input, unsigned int in_stride, uint8_t *output, unsigned int out_stride,
-                         const int32_t *row_bias, const int32_t *col_bias);
+                         const int32_t *row_bias, const int32_t *col_bias, unsigned int start_col);
 
 /*
  * Routine (and helpers) to compute row sums needed for offset correction.
@@ -604,7 +604,6 @@
              * that the terms can simply be added in the requantize code.
              * */
             switch (rows) {
-                default:
                 case 1:
                     /* If we only have one output, just use ADDV.  Multiply
                      * the offset into all four components separately so it
@@ -646,6 +645,9 @@
 
                     vst1q_s32(row_bias, t0);
                     break;
+
+                default:
+                    UNREACHABLE("Impossible.");
             }
         }
 
@@ -836,7 +838,6 @@
 
                 if (numcols==16) {
                     switch(numrows) {
-                        default:
                         case 1:
                             add_block<1>(input + row * in_stride + col, in_stride, col_bias + col);
                             break;
@@ -852,6 +853,9 @@
                         case 4:
                             add_block<4>(input + row * in_stride + col, in_stride, col_bias + col);
                             break;
+
+                        default:
+                            UNREACHABLE("Impossible.");
                     }
                 } else {
                     for (; col<width; col++) {
diff --git a/src/core/NEON/kernels/arm_gemm/quantized.hpp b/src/core/NEON/kernels/arm_gemm/quantized.hpp
index a91a888..b0e0c3b 100644
--- a/src/core/NEON/kernels/arm_gemm/quantized.hpp
+++ b/src/core/NEON/kernels/arm_gemm/quantized.hpp
@@ -28,7 +28,7 @@
 template<typename Tin, typename Tout>
 void requantize_block_32(const Requantize32 &qp, unsigned int width, unsigned int height,
                          const Tin *input, unsigned int in_stride, Tout *output, unsigned int out_stride,
-                         const int32_t *row_bias, const int32_t *col_bias);
+                         const int32_t *row_bias, const int32_t *col_bias, unsigned int start_col);
 
 template<typename T>
 void compute_row_sums(const Requantize32 &qp, unsigned int width, unsigned int height,
diff --git a/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp b/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp
index c3c1e8d..1d3aee7 100644
--- a/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,26 +45,18 @@
 public:
     template<typename TIn>
     void PrepareA(TOperand *out, const TIn *in, const int stride, const int y0,
-                  const int ymax, const int k0, const int kmax, bool transposed) {
-        if (transposed) {
-            Transform<height, block,  true>(out, in, stride, y0, ymax, k0, kmax);
-        } else {
-            Transform<height, block, false>(out, in, stride, y0, ymax, k0, kmax);
-        }
+                  const int ymax, const int k0, const int kmax) const {
+        Transform<height, block, false>(out, in, stride, y0, ymax, k0, kmax);
     }
 
     template<typename TIn>
     void PrepareB(TOperand *out, const TIn *in, const int stride, const int x0,
-                  const int xmax, const int k0, const int kmax, bool transposed) {
-        if (transposed) {
-            Transform<width, block, false>(out, in, stride, x0, xmax, k0, kmax);
-        } else {
-            Transform<width, block,  true>(out, in, stride, x0, xmax, k0, kmax);
-        }
+                  const int xmax, const int k0, const int kmax) const {
+        Transform<width, block,  true>(out, in, stride, x0, xmax, k0, kmax);
     }
 
     template<typename TOut>
-    void Merge(TOut *out, const TResult *in, int stride, int y0, int ymax, int x0, int xmax, const TOut *bias, const Activation act, bool append) {
+    void Merge(TOut *out, const TResult *in, int stride, int y0, int ymax, int x0, int xmax, const TOut *bias, const Activation act, bool append) const {
         MergeResults<width, height>(out, in, stride, y0, ymax, x0, xmax, bias, act, append);
     }
 };
diff --git a/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp b/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp
index 6b64e5e..13c4c47 100644
--- a/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp
+++ b/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,22 +44,14 @@
 public:
     template<typename TIn>
     void PrepareA(TOperand *out, const TIn *in, const int stride, const int y0,
-                  const int ymax, const int k0, const int kmax, bool transposed) {
-        if (transposed) {
-            Transform<height, block,  true>(out, in, stride, y0, ymax, k0, kmax);
-        } else {
-            Transform<height, block, false>(out, in, stride, y0, ymax, k0, kmax);
-        }
+                  const int ymax, const int k0, const int kmax) {
+        Transform<height, block, false>(out, in, stride, y0, ymax, k0, kmax);
     }
 
     template<typename TIn>
     void PrepareB(TOperand *out, const TIn *in, const int stride, const int x0,
-                  const int xmax, const int k0, const int kmax, bool transposed) {
-        if (transposed) {
-            Transform<width_vectors, block, false, true>(out, in, stride, x0, xmax, k0, kmax);
-        } else {
-            Transform<width_vectors, block,  true, true>(out, in, stride, x0, xmax, k0, kmax);
-        }
+                  const int xmax, const int k0, const int kmax) {
+        Transform<width_vectors, block,  true, true>(out, in, stride, x0, xmax, k0, kmax);
     }
 
     template<typename TOut>
diff --git a/src/core/NEON/kernels/arm_gemm/transform.hpp b/src/core/NEON/kernels/arm_gemm/transform.hpp
index bdae903..c6ea079 100644
--- a/src/core/NEON/kernels/arm_gemm/transform.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transform.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
index 543664b..2df5d1b 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -59,7 +59,6 @@
             /* 'first' forces this to always run at least once, needed if the total size is <=7. */
             if ((y + 5) >= ymax) {
                 switch ((y + 5) - ymax) {
-                    /* Everything falls through in here */
                     case 4:
                         inptr1 = zerobuff;
                         // fall through
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
index 587bec3..8f0b8ae 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp
index 6b742c8..9b6f4de 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,8 +38,8 @@
 
     uint8_t zerobuff[16] = { 0 };
 
-    for (uint64_t y = y0 ; y < static_cast<uint64_t>(ymax) ; y+=4) {
-        const uint8_t *inptr0 = inptr + y * ldin + k0;
+    for (int y=y0; y<ymax; y+=4) {
+        const uint8_t *inptr0 = inptr + static_cast<intptr_t>(y) * ldin + k0;
         const uint8_t *inptr1 = inptr0 + ldin;
         const uint8_t *inptr2 = inptr1 + ldin;
         const uint8_t *inptr3 = inptr2 + ldin;
@@ -52,9 +52,8 @@
         int x=(kmax-k0);
         for (;x>15;x-=16) {
             /* Cope with ragged cases by copying from a buffer of zeroes instead */
-            if ((y + 3) >= static_cast<uint64_t>(ymax)) {
+            if ((y + 3) >= ymax) {
                 switch ((y + 3) - ymax) {
-                    /* Everything falls through in here */
                     case 2:
                         inptr1 = zerobuff;
                         // fall through
@@ -90,9 +89,8 @@
 
         if (x>0) {
             /* Need to duplicate this here, in case we didn't run the main loop. */
-            if ((y + 3) >= static_cast<uint64_t>(ymax)) {
+            if ((y + 3) >= ymax) {
                 switch ((y + 3) - ymax) {
-                    /* Everything falls through in here */
                     case 2:
                         inptr1 = zerobuff;
                         // fall through
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp
index 80dd6c5..3d912c4 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,7 +63,6 @@
             /* 'first' forces this to always run at least once, needed if the total size is <=7. */
             if ((y + 7) >= ymax) {
                 switch ((y + 7) - ymax) {
-                    /* Everything falls through in here */
                     case 6:
                         inptr1 = zerobuff;
                         // fall through
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
index 9dfc134..701d688 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,7 +63,6 @@
             /* 'first' forces this to always run at least once, needed if the total size is <=7. */
             if ((y + 7) >= ymax) {
                 switch ((y + 7) - ymax) {
-                    /* Everything falls through in here */
                     case 6:
                         inptr1 = zerobuff;
                         // fall through
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_block4_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_block4_8bit.hpp
index 2bc7801..2546cc5 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_block4_8bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_block4_8bit.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -60,7 +60,7 @@
         }
     };
 
-    uint8_t zerobuff[64]; // 32 for asm loop plus up to 31 for overflow loop
+    uint8_t zerobuff[64] = { 0 }; // 32 for asm loop plus up to 31 for overflow loop
 
     for (int y=y0; y<ymax; y+=8) {
         const uint8_t *inptr0 = inptr + y * ldin + k0;
@@ -87,7 +87,6 @@
             /* 'first' forces this to always run at least once, needed if the total size is <=32. */
             if ((y + 7) >= ymax) {
                 switch ((y + 7) - ymax) {
-                    /* Everything falls through in here */
                     case 6:
                         inptr1 = zerobuff;
                         // fall through
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp
index bde3274..a342d6c 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,7 +63,6 @@
             /* 'first' forces this to always run at least once, needed if the total size is <=7. */
             if ((y + 7) >= ymax) {
                 switch ((y + 7) - ymax) {
-                    /* Everything falls through in here */
                     case 6:
                         inptr1 = zerobuff;
                         // fall through
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_s8_to_s16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_s8_to_s16.hpp
new file mode 100644
index 0000000..37344a8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_s8_to_s16.hpp
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS)
+
+#include <arm_neon.h>
+#include <cstdint>
+
+#include "../asmlib.hpp"
+
+template<>
+template<>
+inline void TransformImpl<8, 1, false, 2, 1, false>::Transform(int16_t *out, const int8_t *in, int ldin, int y0, int ymax, int k0, int kmax) {
+    int16_t *outptr = out;
+    const int8_t *inptr = in;
+    bool first = true;
+
+    int8_t zerobuff[32] = { 0 }; // 16 for asm loop plus up to 15 for overflow loop
+
+    for (int y=y0; y<ymax; y+=8) {
+        const int8_t *inptr0 = inptr + y * ldin + k0;
+        const int8_t *inptr1 = inptr0 + ldin;
+        const int8_t *inptr2 = inptr1 + ldin;
+        const int8_t *inptr3 = inptr2 + ldin;
+        const int8_t *inptr4 = inptr3 + ldin;
+        const int8_t *inptr5 = inptr4 + ldin;
+        const int8_t *inptr6 = inptr5 + ldin;
+        const int8_t *inptr7 = inptr6 + ldin;
+
+        prefetch_2x(inptr0);
+        prefetch_2x(inptr1);
+        prefetch_2x(inptr2);
+        prefetch_2x(inptr3);
+        prefetch_2x(inptr4);
+        prefetch_2x(inptr5);
+        prefetch_2x(inptr6);
+        prefetch_2x(inptr7);
+
+        int x=(kmax-k0);
+        for (;(x>15) || first;x-=16) {
+            /* Cope with ragged cases by copying from a buffer of zeroes instead */
+            /* 'first' forces this to always run at least once, needed if the total size is <=7. */
+            if ((y + 7) >= ymax) {
+                switch ((y + 7) - ymax) {
+                    case 6:
+                        inptr1 = zerobuff;
+                        // fall through
+                    case 5:
+                        inptr2 = zerobuff;
+                        // fall through
+                    case 4:
+                        inptr3 = zerobuff;
+                        // fall through
+                    case 3:
+                        inptr4 = zerobuff;
+                        // fall through
+                    case 2:
+                        inptr5 = zerobuff;
+                        // fall through
+                    case 1:
+                        inptr6 = zerobuff;
+                        // fall through
+                    case 0:
+                        inptr7 = zerobuff;
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+            }
+
+            if (first) {
+                if (x<=15) {
+                    break;
+                }
+
+                first = false;
+            }
+
+            __asm __volatile (
+                // Load up 16 elements (1 source vector, 2 destination vectors) from each of 8 sources.
+                "LDR	q0, [%[inptr0]], #16\n"
+                "LDR	q2, [%[inptr1]], #16\n"
+                "SSHLL2 v1.8h, v0.16b, #0\n"
+                "SSHLL  v0.8h, v0.8b, #0\n"
+                "LDR	q4, [%[inptr2]], #16\n" // q4=C0C1C2C3
+                "SSHLL2 v3.8h, v2.16b, #0\n"
+                "SSHLL  v2.8h, v2.8b, #0\n"
+                "SSHLL2 v5.8h, v4.16b, #0\n"
+                "SSHLL  v4.8h, v4.8b, #0\n"
+                "ZIP1	v16.8h, v0.8h, v4.8h\n" // q16=A0C0A1C1
+                ASM_PREFETCH("[%[inptr0], #128]")
+                "LDR	q6, [%[inptr3]], #16\n" // q6=D0D1D2D3
+                "SSHLL2 v7.8h, v6.16b, #0\n"
+                "SSHLL  v6.8h, v6.8b, #0\n"
+                "ZIP1	v17.8h, v2.8h, v6.8h\n" // q17=B0D0B1D1
+                "LDR	q8, [%[inptr4]], #16\n"
+                "LDR	q10, [%[inptr5]], #16\n"
+                "SSHLL2 v9.8h, v8.16b, #0\n"
+                "SSHLL  v8.8h, v8.8b, #0\n"
+                ASM_PREFETCH("[%[inptr1], #128]")
+                "LDR	q12, [%[inptr6]], #16\n"
+                "SSHLL2 v11.8h, v10.16b, #0\n"
+                "SSHLL  v10.8h, v10.8b, #0\n"
+                "SSHLL2 v13.8h, v12.16b, #0\n"
+                "SSHLL  v12.8h, v12.8b, #0\n"
+                "ZIP1	v18.8h, v8.8h, v12.8h\n"
+                "LDR	q14, [%[inptr7]], #16\n"
+                "SSHLL2 v15.8h, v14.16b, #0\n"
+                "SSHLL  v14.8h, v14.8b, #0\n"
+                "ZIP1	v19.8h, v10.8h, v14.8h\n"
+
+                ASM_PREFETCH("[%[inptr2], #128]")
+                "ZIP1	v20.8h, v16.8h, v17.8h\n" // q20=A0B0C0D0A1B1C1D1
+                "ZIP1	v21.8h, v18.8h, v19.8h\n" // q21=E0F0G0H0E1F1G1H1
+                "ZIP2	v22.8h, v16.8h, v17.8h\n" // q22=A2B2C2D2A3B3C3D3
+                "ZIP2	v23.8h, v18.8h, v19.8h\n" // q23=E2F2G2H1E3F3G3H3
+                ASM_PREFETCH("[%[inptr3], #128]")
+
+                "ZIP2	v16.8h, v0.8h, v4.8h\n"
+                "ZIP2	v17.8h, v2.8h, v6.8h\n"
+                "TRN1	v24.2d, v20.2d, v21.2d\n"
+                "TRN2	v25.2d, v20.2d, v21.2d\n"
+
+                "ZIP2	v18.8h, v8.8h, v12.8h\n"
+                ASM_PREFETCH("[%[inptr4], #128]")
+                "ZIP2	v19.8h, v10.8h, v14.8h\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Write back the first element of each source
+                "TRN1	v24.2d, v22.2d, v23.2d\n"
+                "TRN2	v25.2d, v22.2d, v23.2d\n"
+
+                "ZIP1	v20.8h, v16.8h, v17.8h\n"
+                "ZIP1	v21.8h, v18.8h, v19.8h\n"
+                ASM_PREFETCH("[%[inptr5], #128]")
+                "ZIP2	v22.8h, v16.8h, v17.8h\n"
+                "ZIP2	v23.8h, v18.8h, v19.8h\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Write back the second element of each source
+
+                "ZIP1	v16.8h, v1.8h, v5.8h\n"
+                "ZIP1	v17.8h, v3.8h, v7.8h\n"
+                ASM_PREFETCH("[%[inptr6], #128]")
+                "TRN1	v24.2d, v20.2d, v21.2d\n"
+                "TRN2	v25.2d, v20.2d, v21.2d\n"
+
+                "ZIP1	v18.8h, v9.8h, v13.8h\n"
+                "ZIP1	v19.8h, v11.8h, v15.8h\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Third element
+                "TRN1	v24.2d, v22.2d, v23.2d\n"
+                "TRN2	v25.2d, v22.2d, v23.2d\n"
+                ASM_PREFETCH("[%[inptr7], #128]")
+
+                "ZIP1	v20.8h, v16.8h, v17.8h\n"
+                "ZIP1	v21.8h, v18.8h, v19.8h\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Fourth element
+                "ZIP2	v22.8h, v16.8h, v17.8h\n"
+                "ZIP2	v23.8h, v18.8h, v19.8h\n"
+
+                "ZIP2	v16.8h, v1.8h, v5.8h\n"
+                "ZIP2	v17.8h, v3.8h, v7.8h\n"
+                "TRN1	v24.2d, v20.2d, v21.2d\n"
+                "TRN2	v25.2d, v20.2d, v21.2d\n"
+
+                "ZIP2	v18.8h, v9.8h, v13.8h\n"
+                "ZIP2	v19.8h, v11.8h, v15.8h\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Fifth element
+                "TRN1	v24.2d, v22.2d, v23.2d\n"
+                "TRN2	v25.2d, v22.2d, v23.2d\n"
+
+                "ZIP1	v20.8h, v16.8h, v17.8h\n"
+                "ZIP1	v21.8h, v18.8h, v19.8h\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Sixth element
+                "TRN1	v24.2d, v20.2d, v21.2d\n"
+                "TRN2	v25.2d, v20.2d, v21.2d\n"
+
+                "ZIP2	v22.8h, v16.8h, v17.8h\n"
+                "ZIP2	v23.8h, v18.8h, v19.8h\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Seventh element
+                "TRN1	v24.2d, v22.2d, v23.2d\n"
+                "TRN2	v25.2d, v22.2d, v23.2d\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Eighth element
+                : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
+                  [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
+                :
+                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
+                  "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "memory"
+            );
+        }
+
+        for (;x>0;x--) {
+            *outptr++ = *inptr0++;
+            *outptr++ = *inptr1++;
+            *outptr++ = *inptr2++;
+            *outptr++ = *inptr3++;
+            *outptr++ = *inptr4++;
+            *outptr++ = *inptr5++;
+            *outptr++ = *inptr6++;
+            *outptr++ = *inptr7++;
+        }
+    }
+}
+
+#endif // __aarch64__ && __ARM_FP16_ARGS
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_u8_to_u16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_u8_to_u16.hpp
new file mode 100644
index 0000000..a3a269c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_u8_to_u16.hpp
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS)
+
+#include <arm_neon.h>
+#include <cstdint>
+
+#include "../asmlib.hpp"
+
+template<>
+template<>
+inline void TransformImpl<8, 1, false, 2, 1, false>::Transform(uint16_t *out, const uint8_t *in, int ldin, int y0, int ymax, int k0, int kmax) {
+    uint16_t *outptr = out;
+    const uint8_t *inptr = in;
+    bool first = true;
+
+    uint8_t zerobuff[32] = { 0 }; // 16 for asm loop plus up to 15 for overflow loop
+
+    for (int y=y0; y<ymax; y+=8) {
+        const uint8_t *inptr0 = inptr + y * ldin + k0;
+        const uint8_t *inptr1 = inptr0 + ldin;
+        const uint8_t *inptr2 = inptr1 + ldin;
+        const uint8_t *inptr3 = inptr2 + ldin;
+        const uint8_t *inptr4 = inptr3 + ldin;
+        const uint8_t *inptr5 = inptr4 + ldin;
+        const uint8_t *inptr6 = inptr5 + ldin;
+        const uint8_t *inptr7 = inptr6 + ldin;
+
+        prefetch_2x(inptr0);
+        prefetch_2x(inptr1);
+        prefetch_2x(inptr2);
+        prefetch_2x(inptr3);
+        prefetch_2x(inptr4);
+        prefetch_2x(inptr5);
+        prefetch_2x(inptr6);
+        prefetch_2x(inptr7);
+
+        int x=(kmax-k0);
+        for (;(x>15) || first;x-=16) {
+            /* Cope with ragged cases by copying from a buffer of zeroes instead */
+            /* 'first' forces this to always run at least once, needed if the total size is <=16. */
+            if ((y + 7) >= ymax) {
+                switch ((y + 7) - ymax) {
+                    case 6:
+                        inptr1 = zerobuff;
+                        // fall through
+                    case 5:
+                        inptr2 = zerobuff;
+                        // fall through
+                    case 4:
+                        inptr3 = zerobuff;
+                        // fall through
+                    case 3:
+                        inptr4 = zerobuff;
+                        // fall through
+                    case 2:
+                        inptr5 = zerobuff;
+                        // fall through
+                    case 1:
+                        inptr6 = zerobuff;
+                        // fall through
+                    case 0:
+                        inptr7 = zerobuff;
+                        break;
+
+                    default:
+                        UNREACHABLE("Impossible.");
+                }
+            }
+
+            if (first) {
+                if (x<=15) {
+                    break;
+                }
+
+                first = false;
+            }
+
+            __asm __volatile (
+                // Load up 16 elements (1 source vector, 2 destination vectors) from each of 8 sources.
+                "LDR	q0, [%[inptr0]], #16\n"
+                "LDR	q2, [%[inptr1]], #16\n"
+                "USHLL2 v1.8h, v0.16b, #0\n"
+                "USHLL  v0.8h, v0.8b, #0\n"
+                "LDR	q4, [%[inptr2]], #16\n" // q4=C0C1C2C3
+                "USHLL2 v3.8h, v2.16b, #0\n"
+                "USHLL  v2.8h, v2.8b, #0\n"
+                "USHLL2 v5.8h, v4.16b, #0\n"
+                "USHLL  v4.8h, v4.8b, #0\n"
+                "ZIP1	v16.8h, v0.8h, v4.8h\n" // q16=A0C0A1C1
+                ASM_PREFETCH("[%[inptr0], #128]")
+                "LDR	q6, [%[inptr3]], #16\n" // q6=D0D1D2D3
+                "USHLL2 v7.8h, v6.16b, #0\n"
+                "USHLL  v6.8h, v6.8b, #0\n"
+                "ZIP1	v17.8h, v2.8h, v6.8h\n" // q17=B0D0B1D1
+                "LDR	q8, [%[inptr4]], #16\n"
+                "LDR	q10, [%[inptr5]], #16\n"
+                "USHLL2 v9.8h, v8.16b, #0\n"
+                "USHLL  v8.8h, v8.8b, #0\n"
+                ASM_PREFETCH("[%[inptr1], #128]")
+                "LDR	q12, [%[inptr6]], #16\n"
+                "USHLL2 v11.8h, v10.16b, #0\n"
+                "USHLL  v10.8h, v10.8b, #0\n"
+                "USHLL2 v13.8h, v12.16b, #0\n"
+                "USHLL  v12.8h, v12.8b, #0\n"
+                "ZIP1	v18.8h, v8.8h, v12.8h\n"
+                "LDR	q14, [%[inptr7]], #16\n"
+                "USHLL2 v15.8h, v14.16b, #0\n"
+                "USHLL  v14.8h, v14.8b, #0\n"
+                "ZIP1	v19.8h, v10.8h, v14.8h\n"
+
+                ASM_PREFETCH("[%[inptr2], #128]")
+                "ZIP1	v20.8h, v16.8h, v17.8h\n" // q20=A0B0C0D0A1B1C1D1
+                "ZIP1	v21.8h, v18.8h, v19.8h\n" // q21=E0F0G0H0E1F1G1H1
+                "ZIP2	v22.8h, v16.8h, v17.8h\n" // q22=A2B2C2D2A3B3C3D3
+                "ZIP2	v23.8h, v18.8h, v19.8h\n" // q23=E2F2G2H2E3F3G3H3
+                ASM_PREFETCH("[%[inptr3], #128]")
+
+                "ZIP2	v16.8h, v0.8h, v4.8h\n"
+                "ZIP2	v17.8h, v2.8h, v6.8h\n"
+                "TRN1	v24.2d, v20.2d, v21.2d\n"
+                "TRN2	v25.2d, v20.2d, v21.2d\n"
+
+                "ZIP2	v18.8h, v8.8h, v12.8h\n"
+                ASM_PREFETCH("[%[inptr4], #128]")
+                "ZIP2	v19.8h, v10.8h, v14.8h\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Write back the first element of each source
+                "TRN1	v24.2d, v22.2d, v23.2d\n"
+                "TRN2	v25.2d, v22.2d, v23.2d\n"
+
+                "ZIP1	v20.8h, v16.8h, v17.8h\n"
+                "ZIP1	v21.8h, v18.8h, v19.8h\n"
+                ASM_PREFETCH("[%[inptr5], #128]")
+                "ZIP2	v22.8h, v16.8h, v17.8h\n"
+                "ZIP2	v23.8h, v18.8h, v19.8h\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Write back the second element of each source
+
+                "ZIP1	v16.8h, v1.8h, v5.8h\n"
+                "ZIP1	v17.8h, v3.8h, v7.8h\n"
+                ASM_PREFETCH("[%[inptr6], #128]")
+                "TRN1	v24.2d, v20.2d, v21.2d\n"
+                "TRN2	v25.2d, v20.2d, v21.2d\n"
+
+                "ZIP1	v18.8h, v9.8h, v13.8h\n"
+                "ZIP1	v19.8h, v11.8h, v15.8h\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Third element
+                "TRN1	v24.2d, v22.2d, v23.2d\n"
+                "TRN2	v25.2d, v22.2d, v23.2d\n"
+                ASM_PREFETCH("[%[inptr7], #128]")
+
+                "ZIP1	v20.8h, v16.8h, v17.8h\n"
+                "ZIP1	v21.8h, v18.8h, v19.8h\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Fourth element
+                "ZIP2	v22.8h, v16.8h, v17.8h\n"
+                "ZIP2	v23.8h, v18.8h, v19.8h\n"
+
+                "ZIP2	v16.8h, v1.8h, v5.8h\n"
+                "ZIP2	v17.8h, v3.8h, v7.8h\n"
+                "TRN1	v24.2d, v20.2d, v21.2d\n"
+                "TRN2	v25.2d, v20.2d, v21.2d\n"
+
+                "ZIP2	v18.8h, v9.8h, v13.8h\n"
+                "ZIP2	v19.8h, v11.8h, v15.8h\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Fifth element
+                "TRN1	v24.2d, v22.2d, v23.2d\n"
+                "TRN2	v25.2d, v22.2d, v23.2d\n"
+
+                "ZIP1	v20.8h, v16.8h, v17.8h\n"
+                "ZIP1	v21.8h, v18.8h, v19.8h\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Sixth element
+                "TRN1	v24.2d, v20.2d, v21.2d\n"
+                "TRN2	v25.2d, v20.2d, v21.2d\n"
+
+                "ZIP2	v22.8h, v16.8h, v17.8h\n"
+                "ZIP2	v23.8h, v18.8h, v19.8h\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Seventh element
+                "TRN1	v24.2d, v22.2d, v23.2d\n"
+                "TRN2	v25.2d, v22.2d, v23.2d\n"
+                "STP	q24, q25, [%[outptr]], #32\n" // Eighth element
+                : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
+                  [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
+                :
+                : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
+                  "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "memory"
+            );
+        }
+
+        for (;x>0;x--) {
+            *outptr++ = *inptr0++;
+            *outptr++ = *inptr1++;
+            *outptr++ = *inptr2++;
+            *outptr++ = *inptr3++;
+            *outptr++ = *inptr4++;
+            *outptr++ = *inptr5++;
+            *outptr++ = *inptr6++;
+            *outptr++ = *inptr7++;
+        }
+    }
+}
+
+#endif // __aarch64__ && __ARM_FP16_ARGS
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp
index ec54ce0..5ab5774 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
index 8992c10..d7de9ff 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
index 6d62733..a137f93 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp
index 0080c91..974be48 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
index be66cd4..b825e1c 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,12 +28,15 @@
 #include "a64_interleave_8way_32bit.hpp"
 #include "a64_interleave_8way_block4_8bit.hpp"
 #include "a64_interleave_8way_half_to_float.hpp"
+#include "a64_interleave_8way_s8_to_s16.hpp"
+#include "a64_interleave_8way_u8_to_u16.hpp"
 #include "a64_transpose_interleave_12way_16bit.hpp"
 #include "a64_transpose_interleave_12way_half_to_float.hpp"
 #include "a64_transpose_interleave_24way_16bit.hpp"
 #include "a64_transpose_interleave_8way_32bit.hpp"
 #include "sve_interleave_8way_32bit.hpp"
 #include "sve_interleave_8way_block2_16bit.hpp"
+#include "sve_interleave_8way_block2_32bit.hpp"
 #include "sve_interleave_8way_block4_16bit.hpp"
 #include "sve_interleave_8way_block4_8bit.hpp"
 #include "sve_interleave_8way_block8_8bit.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp
index 881dc7b..348d78e 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_32bit.hpp
index 4cc4311..f21933b 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_32bit.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,12 +36,12 @@
     {
         const int height = ymax-y;
         const long inwidth = (kmax - k0);
-        const long outwidth = (inwidth * 8 + 1) / 2;
+        const long outwidth = ((inwidth + 1) / 2) * 16;
         long inpos = 0;
         long outpos = 0;
 
         uint32_t *outptr = master_outptr;
-        master_outptr += (outwidth * 2);
+        master_outptr += outwidth;
 
         const uint32_t *inptr0 = inptr + y * ldin + k0;
         const uint32_t *inptr1 = inptr0 + ldin;
@@ -60,571 +60,535 @@
                     "whilelt p0.s, %[inpos], %[inwidth]\n"
                     "b.none 2f\n"
                     "mov z4.s, #0\n"
-                    "ld1w z0.s, p0/z, [%[inptr0]]\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
+                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
                     "incw %[inpos], all, mul #1\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "addvl %[inptr0], %[inptr0], #1\n"
-                    "zip1 z0.d, z8.d, z4.d\n"
-                    "whilelt p0.d, %[outpos], %[outwidth]\n"
-                    "zip2 z1.d, z8.d, z4.d\n"
-                    "incd %[outpos], all, mul #1\n"
-                    "zip1 z2.d, z9.d, z4.d\n"
-                    "whilelt p1.d, %[outpos], %[outwidth]\n"
-                    "zip2 z3.d, z9.d, z4.d\n"
-                    "incd %[outpos], all, mul #1\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z8.d, z0.d, z4.d\n"
-                    "st1d z8.d, p0, [%[outptr]]\n"
                     "zip2 z9.d, z0.d, z4.d\n"
-                    "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
+                    "zip1 z0.d, z8.d, z4.d\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z1.d, z8.d, z4.d\n"
+                    "zip1 z2.d, z9.d, z4.d\n"
+                    "zip2 z3.d, z9.d, z4.d\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "zip1 z8.d, z0.d, z4.d\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z9.d, z0.d, z4.d\n"
                     "zip1 z10.d, z1.d, z4.d\n"
-                    "whilelt p2.d, %[outpos], %[outwidth]\n"
+                    "st1w z8.s, p0, [%[outptr]]\n"
                     "zip2 z11.d, z1.d, z4.d\n"
-                    "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+                    "whilelt p3.s, %[outpos], %[outwidth]\n"
                     "zip1 z12.d, z2.d, z4.d\n"
-                    "incd %[outpos], all, mul #1\n"
+                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
                     "zip2 z13.d, z2.d, z4.d\n"
-                    "whilelt p3.d, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z14.d, z3.d, z4.d\n"
-                    "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
+                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
                     "zip2 z15.d, z3.d, z4.d\n"
-                    "incd %[outpos], all, mul #1\n"
-                    "whilelt p0.d, %[outpos], %[outwidth]\n"
-                    "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
-                    "incd %[outpos], all, mul #1\n"
-                    "whilelt p1.d, %[outpos], %[outwidth]\n"
-                    "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
-                    "incd %[outpos], all, mul #1\n"
-                    "whilelt p2.d, %[outpos], %[outwidth]\n"
-                    "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
-                    "incd %[outpos], all, mul #1\n"
-                    "whilelt p3.d, %[outpos], %[outwidth]\n"
-                    "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
-                    "incd %[outpos], all, mul #1\n"
+                    "whilelt p4.s, %[outpos], %[outwidth]\n"
+                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+                    "whilelt p5.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+                    "whilelt p6.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+                    "whilelt p7.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
                     "addvl %[outptr], %[outptr], #8\n"
                     "b 1b\n"
                     "2:\n"
                 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0)
                 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
                 );
                 break;
-            
+
             case 2:
                 __asm __volatile(
                     "1:\n"
                     "whilelt p0.s, %[inpos], %[inwidth]\n"
                     "b.none 2f\n"
                     "mov z4.s, #0\n"
-                    "ld1w z0.s, p0/z, [%[inptr0]]\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "ld1w z1.s, p0/z, [%[inptr1]]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "incw %[inpos], all, mul #1\n"
-                    "zip1 z10.d, z1.d, z4.d\n"
-                    "addvl %[inptr0], %[inptr0], #1\n"
-                    "zip2 z11.d, z1.d, z4.d\n"
-                    "addvl %[inptr1], %[inptr1], #1\n"
-                    "zip1 z0.d, z8.d, z4.d\n"
-                    "whilelt p0.d, %[outpos], %[outwidth]\n"
-                    "zip2 z1.d, z8.d, z4.d\n"
-                    "incd %[outpos], all, mul #1\n"
-                    "zip1 z2.d, z9.d, z4.d\n"
-                    "whilelt p1.d, %[outpos], %[outwidth]\n"
-                    "zip2 z3.d, z9.d, z4.d\n"
-                    "incd %[outpos], all, mul #1\n"
                     "mov z14.s, #0\n"
-                    "whilelt p2.d, %[outpos], %[outwidth]\n"
-                    "zip1 z4.d, z10.d, z14.d\n"
-                    "incd %[outpos], all, mul #1\n"
-                    "zip2 z5.d, z10.d, z14.d\n"
-                    "whilelt p3.d, %[outpos], %[outwidth]\n"
-                    "zip1 z6.d, z11.d, z14.d\n"
-                    "incd %[outpos], all, mul #1\n"
-                    "zip2 z7.d, z11.d, z14.d\n"
+                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
+                    "incw %[inpos], all, mul #1\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
                     "zip1 z8.d, z0.d, z4.d\n"
-                    "st1d z8.d, p0, [%[outptr]]\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip2 z9.d, z0.d, z4.d\n"
-                    "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
+                    "zip1 z10.d, z1.d, z4.d\n"
+                    "zip2 z11.d, z1.d, z4.d\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
+                    "zip1 z0.d, z8.d, z4.d\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z1.d, z8.d, z4.d\n"
+                    "zip1 z2.d, z9.d, z4.d\n"
+                    "zip2 z3.d, z9.d, z4.d\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "zip1 z4.d, z10.d, z14.d\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z5.d, z10.d, z14.d\n"
+                    "zip1 z6.d, z11.d, z14.d\n"
+                    "zip2 z7.d, z11.d, z14.d\n"
+                    "whilelt p3.s, %[outpos], %[outwidth]\n"
+                    "zip1 z8.d, z0.d, z4.d\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z9.d, z0.d, z4.d\n"
                     "zip1 z10.d, z1.d, z5.d\n"
-                    "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+                    "st1w z8.s, p0, [%[outptr]]\n"
                     "zip2 z11.d, z1.d, z5.d\n"
-                    "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
+                    "whilelt p4.s, %[outpos], %[outwidth]\n"
                     "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p0.d, %[outpos], %[outwidth]\n"
+                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
                     "zip2 z13.d, z2.d, z6.d\n"
-                    "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z14.d, z3.d, z7.d\n"
-                    "incd %[outpos], all, mul #1\n"
+                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
                     "zip2 z15.d, z3.d, z7.d\n"
-                    "whilelt p1.d, %[outpos], %[outwidth]\n"
-                    "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
-                    "incd %[outpos], all, mul #1\n"
-                    "whilelt p2.d, %[outpos], %[outwidth]\n"
-                    "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
-                    "incd %[outpos], all, mul #1\n"
-                    "whilelt p3.d, %[outpos], %[outwidth]\n"
-                    "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
-                    "incd %[outpos], all, mul #1\n"
+                    "whilelt p5.s, %[outpos], %[outwidth]\n"
+                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+                    "whilelt p6.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+                    "whilelt p7.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
                     "addvl %[outptr], %[outptr], #8\n"
                     "b 1b\n"
                     "2:\n"
                 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1)
                 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
                 );
                 break;
-            
+
             case 3:
                 __asm __volatile(
                     "1:\n"
                     "whilelt p0.s, %[inpos], %[inwidth]\n"
                     "b.none 2f\n"
                     "mov z4.s, #0\n"
-                    "ld1w z0.s, p0/z, [%[inptr0]]\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "ld1w z1.s, p0/z, [%[inptr1]]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "ld1w z2.s, p0/z, [%[inptr2]]\n"
-                    "zip1 z10.d, z1.d, z4.d\n"
-                    "incw %[inpos], all, mul #1\n"
-                    "zip2 z11.d, z1.d, z4.d\n"
-                    "addvl %[inptr0], %[inptr0], #1\n"
-                    "zip1 z12.d, z2.d, z4.d\n"
-                    "addvl %[inptr1], %[inptr1], #1\n"
-                    "zip2 z13.d, z2.d, z4.d\n"
-                    "addvl %[inptr2], %[inptr2], #1\n"
-                    "zip1 z0.d, z8.d, z12.d\n"
-                    "whilelt p0.d, %[outpos], %[outwidth]\n"
-                    "zip2 z1.d, z8.d, z12.d\n"
-                    "incd %[outpos], all, mul #1\n"
-                    "zip1 z2.d, z9.d, z13.d\n"
-                    "whilelt p1.d, %[outpos], %[outwidth]\n"
-                    "zip2 z3.d, z9.d, z13.d\n"
-                    "incd %[outpos], all, mul #1\n"
                     "mov z14.s, #0\n"
-                    "whilelt p2.d, %[outpos], %[outwidth]\n"
-                    "zip1 z4.d, z10.d, z14.d\n"
-                    "incd %[outpos], all, mul #1\n"
-                    "zip2 z5.d, z10.d, z14.d\n"
-                    "whilelt p3.d, %[outpos], %[outwidth]\n"
-                    "zip1 z6.d, z11.d, z14.d\n"
-                    "incd %[outpos], all, mul #1\n"
-                    "zip2 z7.d, z11.d, z14.d\n"
+                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
+                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
+                    "incw %[inpos], all, mul #1\n"
                     "zip1 z8.d, z0.d, z4.d\n"
-                    "st1d z8.d, p0, [%[outptr]]\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
                     "zip2 z9.d, z0.d, z4.d\n"
-                    "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z10.d, z1.d, z4.d\n"
+                    "zip2 z11.d, z1.d, z4.d\n"
+                    "zip1 z12.d, z2.d, z4.d\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
+                    "zip2 z13.d, z2.d, z4.d\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z0.d, z8.d, z12.d\n"
+                    "zip2 z1.d, z8.d, z12.d\n"
+                    "zip1 z2.d, z9.d, z13.d\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "zip2 z3.d, z9.d, z13.d\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z4.d, z10.d, z14.d\n"
+                    "zip2 z5.d, z10.d, z14.d\n"
+                    "zip1 z6.d, z11.d, z14.d\n"
+                    "whilelt p3.s, %[outpos], %[outwidth]\n"
+                    "zip2 z7.d, z11.d, z14.d\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z8.d, z0.d, z4.d\n"
+                    "zip2 z9.d, z0.d, z4.d\n"
                     "zip1 z10.d, z1.d, z5.d\n"
-                    "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+                    "whilelt p4.s, %[outpos], %[outwidth]\n"
                     "zip2 z11.d, z1.d, z5.d\n"
-                    "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
+                    "st1w z8.s, p0, [%[outptr]]\n"
                     "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p0.d, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip2 z13.d, z2.d, z6.d\n"
-                    "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
                     "zip1 z14.d, z3.d, z7.d\n"
-                    "incd %[outpos], all, mul #1\n"
                     "zip2 z15.d, z3.d, z7.d\n"
-                    "whilelt p1.d, %[outpos], %[outwidth]\n"
-                    "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
-                    "incd %[outpos], all, mul #1\n"
-                    "whilelt p2.d, %[outpos], %[outwidth]\n"
-                    "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
-                    "incd %[outpos], all, mul #1\n"
-                    "whilelt p3.d, %[outpos], %[outwidth]\n"
-                    "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
-                    "incd %[outpos], all, mul #1\n"
+                    "whilelt p5.s, %[outpos], %[outwidth]\n"
+                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+                    "whilelt p6.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+                    "whilelt p7.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
                     "addvl %[outptr], %[outptr], #8\n"
                     "b 1b\n"
                     "2:\n"
                 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2)
                 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
                 );
                 break;
-            
+
             case 4:
                 __asm __volatile(
                     "1:\n"
                     "whilelt p0.s, %[inpos], %[inwidth]\n"
                     "b.none 2f\n"
                     "mov z4.s, #0\n"
-                    "ld1w z0.s, p0/z, [%[inptr0]]\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "ld1w z1.s, p0/z, [%[inptr1]]\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "ld1w z2.s, p0/z, [%[inptr2]]\n"
-                    "zip1 z10.d, z1.d, z4.d\n"
-                    "ld1w z3.s, p0/z, [%[inptr3]]\n"
-                    "zip2 z11.d, z1.d, z4.d\n"
+                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
+                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
+                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
                     "incw %[inpos], all, mul #1\n"
-                    "zip1 z12.d, z2.d, z4.d\n"
-                    "addvl %[inptr0], %[inptr0], #1\n"
-                    "zip2 z13.d, z2.d, z4.d\n"
-                    "addvl %[inptr1], %[inptr1], #1\n"
-                    "zip1 z14.d, z3.d, z4.d\n"
-                    "addvl %[inptr2], %[inptr2], #1\n"
-                    "zip2 z15.d, z3.d, z4.d\n"
-                    "addvl %[inptr3], %[inptr3], #1\n"
-                    "zip1 z0.d, z8.d, z12.d\n"
-                    "whilelt p0.d, %[outpos], %[outwidth]\n"
-                    "zip2 z1.d, z8.d, z12.d\n"
-                    "incd %[outpos], all, mul #1\n"
-                    "zip1 z2.d, z9.d, z13.d\n"
-                    "whilelt p1.d, %[outpos], %[outwidth]\n"
-                    "zip2 z3.d, z9.d, z13.d\n"
-                    "incd %[outpos], all, mul #1\n"
-                    "zip1 z4.d, z10.d, z14.d\n"
-                    "whilelt p2.d, %[outpos], %[outwidth]\n"
-                    "zip2 z5.d, z10.d, z14.d\n"
-                    "incd %[outpos], all, mul #1\n"
-                    "zip1 z6.d, z11.d, z15.d\n"
-                    "whilelt p3.d, %[outpos], %[outwidth]\n"
-                    "zip2 z7.d, z11.d, z15.d\n"
-                    "incd %[outpos], all, mul #1\n"
                     "zip1 z8.d, z0.d, z4.d\n"
-                    "st1d z8.d, p0, [%[outptr]]\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
                     "zip2 z9.d, z0.d, z4.d\n"
-                    "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z10.d, z1.d, z4.d\n"
+                    "zip2 z11.d, z1.d, z4.d\n"
+                    "zip1 z12.d, z2.d, z4.d\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
+                    "zip2 z13.d, z2.d, z4.d\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z14.d, z3.d, z4.d\n"
+                    "zip2 z15.d, z3.d, z4.d\n"
+                    "zip1 z0.d, z8.d, z12.d\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
+                    "zip2 z1.d, z8.d, z12.d\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z2.d, z9.d, z13.d\n"
+                    "zip2 z3.d, z9.d, z13.d\n"
+                    "zip1 z4.d, z10.d, z14.d\n"
+                    "whilelt p3.s, %[outpos], %[outwidth]\n"
+                    "zip2 z5.d, z10.d, z14.d\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z6.d, z11.d, z15.d\n"
+                    "zip2 z7.d, z11.d, z15.d\n"
+                    "zip1 z8.d, z0.d, z4.d\n"
+                    "whilelt p4.s, %[outpos], %[outwidth]\n"
+                    "zip2 z9.d, z0.d, z4.d\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z10.d, z1.d, z5.d\n"
-                    "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+                    "st1w z8.s, p0, [%[outptr]]\n"
                     "zip2 z11.d, z1.d, z5.d\n"
-                    "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
                     "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p0.d, %[outpos], %[outwidth]\n"
+                    "whilelt p5.s, %[outpos], %[outwidth]\n"
                     "zip2 z13.d, z2.d, z6.d\n"
-                    "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
                     "zip1 z14.d, z3.d, z7.d\n"
-                    "incd %[outpos], all, mul #1\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip2 z15.d, z3.d, z7.d\n"
-                    "whilelt p1.d, %[outpos], %[outwidth]\n"
-                    "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
-                    "incd %[outpos], all, mul #1\n"
-                    "whilelt p2.d, %[outpos], %[outwidth]\n"
-                    "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
-                    "incd %[outpos], all, mul #1\n"
-                    "whilelt p3.d, %[outpos], %[outwidth]\n"
-                    "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
-                    "incd %[outpos], all, mul #1\n"
+                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+                    "whilelt p6.s, %[outpos], %[outwidth]\n"
+                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+                    "whilelt p7.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
                     "addvl %[outptr], %[outptr], #8\n"
                     "b 1b\n"
                     "2:\n"
                 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3)
                 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
                 );
                 break;
-            
+
             case 5:
                 __asm __volatile(
                     "1:\n"
                     "whilelt p0.s, %[inpos], %[inwidth]\n"
                     "b.none 2f\n"
                     "mov z5.s, #0\n"
-                    "ld1w z0.s, p0/z, [%[inptr0]]\n"
-                    "ld1w z1.s, p0/z, [%[inptr1]]\n"
+                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
+                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
+                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
+                    "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
                     "incw %[inpos], all, mul #1\n"
                     "zip1 z10.d, z1.d, z5.d\n"
-                    "ld1w z2.s, p0/z, [%[inptr2]]\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
+                    "zip1 z8.d, z0.d, z4.d\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip2 z9.d, z0.d, z4.d\n"
                     "zip2 z11.d, z1.d, z5.d\n"
-                    "ld1w z3.s, p0/z, [%[inptr3]]\n"
                     "zip1 z12.d, z2.d, z5.d\n"
-                    "ld1w z4.s, p0/z, [%[inptr4]]\n"
-                    "zip1 z8.d, z0.d, z4.d\n"
-                    "addvl %[inptr0], %[inptr0], #1\n"
-                    "zip2 z9.d, z0.d, z4.d\n"
-                    "addvl %[inptr1], %[inptr1], #1\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
                     "zip2 z13.d, z2.d, z5.d\n"
-                    "addvl %[inptr2], %[inptr2], #1\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z14.d, z3.d, z5.d\n"
-                    "addvl %[inptr3], %[inptr3], #1\n"
                     "zip2 z15.d, z3.d, z5.d\n"
-                    "addvl %[inptr4], %[inptr4], #1\n"
                     "zip1 z0.d, z8.d, z12.d\n"
-                    "whilelt p0.d, %[outpos], %[outwidth]\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
                     "zip2 z1.d, z8.d, z12.d\n"
-                    "incd %[outpos], all, mul #1\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z2.d, z9.d, z13.d\n"
-                    "whilelt p1.d, %[outpos], %[outwidth]\n"
                     "zip2 z3.d, z9.d, z13.d\n"
-                    "incd %[outpos], all, mul #1\n"
                     "zip1 z4.d, z10.d, z14.d\n"
-                    "whilelt p2.d, %[outpos], %[outwidth]\n"
+                    "whilelt p3.s, %[outpos], %[outwidth]\n"
                     "zip2 z5.d, z10.d, z14.d\n"
-                    "incd %[outpos], all, mul #1\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z6.d, z11.d, z15.d\n"
-                    "whilelt p3.d, %[outpos], %[outwidth]\n"
                     "zip2 z7.d, z11.d, z15.d\n"
-                    "incd %[outpos], all, mul #1\n"
                     "zip1 z8.d, z0.d, z4.d\n"
-                    "st1d z8.d, p0, [%[outptr]]\n"
+                    "whilelt p4.s, %[outpos], %[outwidth]\n"
                     "zip2 z9.d, z0.d, z4.d\n"
-                    "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z10.d, z1.d, z5.d\n"
-                    "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+                    "st1w z8.s, p0, [%[outptr]]\n"
                     "zip2 z11.d, z1.d, z5.d\n"
-                    "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
                     "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p0.d, %[outpos], %[outwidth]\n"
+                    "whilelt p5.s, %[outpos], %[outwidth]\n"
                     "zip2 z13.d, z2.d, z6.d\n"
-                    "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
                     "zip1 z14.d, z3.d, z7.d\n"
-                    "incd %[outpos], all, mul #1\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip2 z15.d, z3.d, z7.d\n"
-                    "whilelt p1.d, %[outpos], %[outwidth]\n"
-                    "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
-                    "incd %[outpos], all, mul #1\n"
-                    "whilelt p2.d, %[outpos], %[outwidth]\n"
-                    "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
-                    "incd %[outpos], all, mul #1\n"
-                    "whilelt p3.d, %[outpos], %[outwidth]\n"
-                    "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
-                    "incd %[outpos], all, mul #1\n"
+                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+                    "whilelt p6.s, %[outpos], %[outwidth]\n"
+                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+                    "whilelt p7.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
                     "addvl %[outptr], %[outptr], #8\n"
                     "b 1b\n"
                     "2:\n"
                 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4)
                 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
                 );
                 break;
-            
+
             case 6:
                 __asm __volatile(
                     "1:\n"
                     "whilelt p0.s, %[inpos], %[inwidth]\n"
                     "b.none 2f\n"
                     "mov z6.s, #0\n"
-                    "ld1w z0.s, p0/z, [%[inptr0]]\n"
-                    "ld1w z1.s, p0/z, [%[inptr1]]\n"
+                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
+                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
+                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
+                    "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
+                    "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
                     "incw %[inpos], all, mul #1\n"
-                    "ld1w z2.s, p0/z, [%[inptr2]]\n"
-                    "addvl %[inptr0], %[inptr0], #1\n"
                     "zip1 z12.d, z2.d, z6.d\n"
-                    "ld1w z3.s, p0/z, [%[inptr3]]\n"
-                    "zip2 z13.d, z2.d, z6.d\n"
-                    "ld1w z4.s, p0/z, [%[inptr4]]\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
                     "zip1 z8.d, z0.d, z4.d\n"
-                    "ld1w z5.s, p0/z, [%[inptr5]]\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip2 z9.d, z0.d, z4.d\n"
-                    "addvl %[inptr1], %[inptr1], #1\n"
                     "zip1 z10.d, z1.d, z5.d\n"
-                    "addvl %[inptr2], %[inptr2], #1\n"
                     "zip2 z11.d, z1.d, z5.d\n"
-                    "addvl %[inptr3], %[inptr3], #1\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
+                    "zip2 z13.d, z2.d, z6.d\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z14.d, z3.d, z6.d\n"
-                    "addvl %[inptr4], %[inptr4], #1\n"
                     "zip2 z15.d, z3.d, z6.d\n"
-                    "addvl %[inptr5], %[inptr5], #1\n"
                     "zip1 z0.d, z8.d, z12.d\n"
-                    "whilelt p0.d, %[outpos], %[outwidth]\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
                     "zip2 z1.d, z8.d, z12.d\n"
-                    "incd %[outpos], all, mul #1\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z2.d, z9.d, z13.d\n"
-                    "whilelt p1.d, %[outpos], %[outwidth]\n"
                     "zip2 z3.d, z9.d, z13.d\n"
-                    "incd %[outpos], all, mul #1\n"
                     "zip1 z4.d, z10.d, z14.d\n"
-                    "whilelt p2.d, %[outpos], %[outwidth]\n"
+                    "whilelt p3.s, %[outpos], %[outwidth]\n"
                     "zip2 z5.d, z10.d, z14.d\n"
-                    "incd %[outpos], all, mul #1\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z6.d, z11.d, z15.d\n"
-                    "whilelt p3.d, %[outpos], %[outwidth]\n"
                     "zip2 z7.d, z11.d, z15.d\n"
-                    "incd %[outpos], all, mul #1\n"
                     "zip1 z8.d, z0.d, z4.d\n"
-                    "st1d z8.d, p0, [%[outptr]]\n"
+                    "whilelt p4.s, %[outpos], %[outwidth]\n"
                     "zip2 z9.d, z0.d, z4.d\n"
-                    "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z10.d, z1.d, z5.d\n"
-                    "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+                    "st1w z8.s, p0, [%[outptr]]\n"
                     "zip2 z11.d, z1.d, z5.d\n"
-                    "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
                     "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p0.d, %[outpos], %[outwidth]\n"
+                    "whilelt p5.s, %[outpos], %[outwidth]\n"
                     "zip2 z13.d, z2.d, z6.d\n"
-                    "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
                     "zip1 z14.d, z3.d, z7.d\n"
-                    "incd %[outpos], all, mul #1\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip2 z15.d, z3.d, z7.d\n"
-                    "whilelt p1.d, %[outpos], %[outwidth]\n"
-                    "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
-                    "incd %[outpos], all, mul #1\n"
-                    "whilelt p2.d, %[outpos], %[outwidth]\n"
-                    "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
-                    "incd %[outpos], all, mul #1\n"
-                    "whilelt p3.d, %[outpos], %[outwidth]\n"
-                    "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
-                    "incd %[outpos], all, mul #1\n"
+                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+                    "whilelt p6.s, %[outpos], %[outwidth]\n"
+                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+                    "whilelt p7.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
                     "addvl %[outptr], %[outptr], #8\n"
                     "b 1b\n"
                     "2:\n"
                 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5)
                 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
                 );
                 break;
-            
+
             case 7:
                 __asm __volatile(
                     "1:\n"
                     "whilelt p0.s, %[inpos], %[inwidth]\n"
                     "b.none 2f\n"
                     "mov z7.s, #0\n"
-                    "ld1w z0.s, p0/z, [%[inptr0]]\n"
-                    "ld1w z1.s, p0/z, [%[inptr1]]\n"
+                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
+                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
+                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
+                    "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
+                    "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
+                    "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n"
                     "incw %[inpos], all, mul #1\n"
-                    "ld1w z2.s, p0/z, [%[inptr2]]\n"
-                    "addvl %[inptr0], %[inptr0], #1\n"
-                    "ld1w z3.s, p0/z, [%[inptr3]]\n"
-                    "addvl %[inptr1], %[inptr1], #1\n"
-                    "zip1 z14.d, z3.d, z7.d\n"
-                    "ld1w z4.s, p0/z, [%[inptr4]]\n"
                     "zip1 z8.d, z0.d, z4.d\n"
-                    "ld1w z5.s, p0/z, [%[inptr5]]\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
                     "zip2 z9.d, z0.d, z4.d\n"
-                    "ld1w z6.s, p0/z, [%[inptr6]]\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z10.d, z1.d, z5.d\n"
-                    "addvl %[inptr2], %[inptr2], #1\n"
                     "zip2 z11.d, z1.d, z5.d\n"
-                    "addvl %[inptr3], %[inptr3], #1\n"
                     "zip1 z12.d, z2.d, z6.d\n"
-                    "addvl %[inptr4], %[inptr4], #1\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
                     "zip2 z13.d, z2.d, z6.d\n"
-                    "addvl %[inptr5], %[inptr5], #1\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "zip1 z14.d, z3.d, z7.d\n"
                     "zip2 z15.d, z3.d, z7.d\n"
-                    "addvl %[inptr6], %[inptr6], #1\n"
                     "zip1 z0.d, z8.d, z12.d\n"
-                    "whilelt p0.d, %[outpos], %[outwidth]\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
                     "zip2 z1.d, z8.d, z12.d\n"
-                    "incd %[outpos], all, mul #1\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z2.d, z9.d, z13.d\n"
-                    "whilelt p1.d, %[outpos], %[outwidth]\n"
                     "zip2 z3.d, z9.d, z13.d\n"
-                    "incd %[outpos], all, mul #1\n"
                     "zip1 z4.d, z10.d, z14.d\n"
-                    "whilelt p2.d, %[outpos], %[outwidth]\n"
+                    "whilelt p3.s, %[outpos], %[outwidth]\n"
                     "zip2 z5.d, z10.d, z14.d\n"
-                    "incd %[outpos], all, mul #1\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z6.d, z11.d, z15.d\n"
-                    "whilelt p3.d, %[outpos], %[outwidth]\n"
                     "zip2 z7.d, z11.d, z15.d\n"
-                    "incd %[outpos], all, mul #1\n"
                     "zip1 z8.d, z0.d, z4.d\n"
-                    "st1d z8.d, p0, [%[outptr]]\n"
+                    "whilelt p4.s, %[outpos], %[outwidth]\n"
                     "zip2 z9.d, z0.d, z4.d\n"
-                    "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z10.d, z1.d, z5.d\n"
-                    "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+                    "st1w z8.s, p0, [%[outptr]]\n"
                     "zip2 z11.d, z1.d, z5.d\n"
-                    "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
                     "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p0.d, %[outpos], %[outwidth]\n"
+                    "whilelt p5.s, %[outpos], %[outwidth]\n"
                     "zip2 z13.d, z2.d, z6.d\n"
-                    "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
                     "zip1 z14.d, z3.d, z7.d\n"
-                    "incd %[outpos], all, mul #1\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip2 z15.d, z3.d, z7.d\n"
-                    "whilelt p1.d, %[outpos], %[outwidth]\n"
-                    "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
-                    "incd %[outpos], all, mul #1\n"
-                    "whilelt p2.d, %[outpos], %[outwidth]\n"
-                    "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
-                    "incd %[outpos], all, mul #1\n"
-                    "whilelt p3.d, %[outpos], %[outwidth]\n"
-                    "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
-                    "incd %[outpos], all, mul #1\n"
+                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+                    "whilelt p6.s, %[outpos], %[outwidth]\n"
+                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+                    "whilelt p7.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
                     "addvl %[outptr], %[outptr], #8\n"
                     "b 1b\n"
                     "2:\n"
                 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6)
                 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
                 );
                 break;
-            
+
             default:
             case 8:
                 __asm __volatile(
                     "1:\n"
                     "whilelt p0.s, %[inpos], %[inwidth]\n"
                     "b.none 2f\n"
-                    "ld1w z0.s, p0/z, [%[inptr0]]\n"
+                    "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
+                    "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
+                    "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
+                    "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
+                    "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
+                    "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
+                    "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n"
+                    "ld1w z7.s, p0/z, [%[inptr7], %[inpos], LSL #2]\n"
                     "incw %[inpos], all, mul #1\n"
-                    "ld1w z1.s, p0/z, [%[inptr1]]\n"
-                    "addvl %[inptr0], %[inptr0], #1\n"
-                    "ld1w z2.s, p0/z, [%[inptr2]]\n"
-                    "addvl %[inptr1], %[inptr1], #1\n"
-                    "ld1w z3.s, p0/z, [%[inptr3]]\n"
-                    "addvl %[inptr2], %[inptr2], #1\n"
-                    "ld1w z4.s, p0/z, [%[inptr4]]\n"
-                    "addvl %[inptr3], %[inptr3], #1\n"
                     "zip1 z8.d, z0.d, z4.d\n"
-                    "ld1w z5.s, p0/z, [%[inptr5]]\n"
+                    "whilelt p0.s, %[outpos], %[outwidth]\n"
                     "zip2 z9.d, z0.d, z4.d\n"
-                    "ld1w z6.s, p0/z, [%[inptr6]]\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z10.d, z1.d, z5.d\n"
-                    "ld1w z7.s, p0/z, [%[inptr7]]\n"
                     "zip2 z11.d, z1.d, z5.d\n"
-                    "addvl %[inptr4], %[inptr4], #1\n"
                     "zip1 z12.d, z2.d, z6.d\n"
-                    "addvl %[inptr5], %[inptr5], #1\n"
+                    "whilelt p1.s, %[outpos], %[outwidth]\n"
                     "zip2 z13.d, z2.d, z6.d\n"
-                    "addvl %[inptr6], %[inptr6], #1\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z14.d, z3.d, z7.d\n"
-                    "addvl %[inptr7], %[inptr7], #1\n"
                     "zip2 z15.d, z3.d, z7.d\n"
-                    "whilelt p0.d, %[outpos], %[outwidth]\n"
                     "zip1 z0.d, z8.d, z12.d\n"
-                    "incd %[outpos], all, mul #1\n"
+                    "whilelt p2.s, %[outpos], %[outwidth]\n"
                     "zip2 z1.d, z8.d, z12.d\n"
-                    "whilelt p1.d, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z2.d, z9.d, z13.d\n"
-                    "incd %[outpos], all, mul #1\n"
                     "zip2 z3.d, z9.d, z13.d\n"
-                    "whilelt p2.d, %[outpos], %[outwidth]\n"
                     "zip1 z4.d, z10.d, z14.d\n"
-                    "incd %[outpos], all, mul #1\n"
+                    "whilelt p3.s, %[outpos], %[outwidth]\n"
                     "zip2 z5.d, z10.d, z14.d\n"
-                    "whilelt p3.d, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z6.d, z11.d, z15.d\n"
-                    "incd %[outpos], all, mul #1\n"
                     "zip2 z7.d, z11.d, z15.d\n"
                     "zip1 z8.d, z0.d, z4.d\n"
-                    "st1d z8.d, p0, [%[outptr]]\n"
+                    "whilelt p4.s, %[outpos], %[outwidth]\n"
                     "zip2 z9.d, z0.d, z4.d\n"
-                    "st1d z9.d, p1, [%[outptr], #1, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip1 z10.d, z1.d, z5.d\n"
-                    "st1d z10.d, p2, [%[outptr], #2, MUL VL]\n"
+                    "st1w z8.s, p0, [%[outptr]]\n"
                     "zip2 z11.d, z1.d, z5.d\n"
-                    "st1d z11.d, p3, [%[outptr], #3, MUL VL]\n"
                     "zip1 z12.d, z2.d, z6.d\n"
-                    "whilelt p0.d, %[outpos], %[outwidth]\n"
+                    "whilelt p5.s, %[outpos], %[outwidth]\n"
                     "zip2 z13.d, z2.d, z6.d\n"
-                    "st1d z12.d, p0, [%[outptr], #4, MUL VL]\n"
+                    "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
                     "zip1 z14.d, z3.d, z7.d\n"
-                    "incd %[outpos], all, mul #1\n"
+                    "incw %[outpos], all, mul #1\n"
                     "zip2 z15.d, z3.d, z7.d\n"
-                    "whilelt p1.d, %[outpos], %[outwidth]\n"
-                    "st1d z13.d, p1, [%[outptr], #5, MUL VL]\n"
-                    "incd %[outpos], all, mul #1\n"
-                    "whilelt p2.d, %[outpos], %[outwidth]\n"
-                    "st1d z14.d, p2, [%[outptr], #6, MUL VL]\n"
-                    "incd %[outpos], all, mul #1\n"
-                    "whilelt p3.d, %[outpos], %[outwidth]\n"
-                    "st1d z15.d, p3, [%[outptr], #7, MUL VL]\n"
-                    "incd %[outpos], all, mul #1\n"
+                    "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
+                    "whilelt p6.s, %[outpos], %[outwidth]\n"
+                    "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
+                    "whilelt p7.s, %[outpos], %[outwidth]\n"
+                    "incw %[outpos], all, mul #1\n"
+                    "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
+                    "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
+                    "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
                     "addvl %[outptr], %[outptr], #8\n"
                     "b 1b\n"
                     "2:\n"
                 : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7)
                 : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
-                : "p0", "p1", "p2", "p3", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
+                : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
                 );
                 break;
-            
-            
+
+
         }
     }
 }
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_8bit.hpp
index a96a43c..ed0d58a 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_8bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_8bit.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 - 2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp b/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp
index 63e85c1..aac5e19 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/utils.hpp b/src/core/NEON/kernels/arm_gemm/utils.hpp
index 7dbbe91..6e47a97 100644
--- a/src/core/NEON/kernels/arm_gemm/utils.hpp
+++ b/src/core/NEON/kernels/arm_gemm/utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,8 +32,6 @@
 // Paranoid option for the above with assert
 // #define UNREACHABLE(why)   assert(0 && why)
 
-#define UNUSED(x)   (void)(x)
-
 template<typename T>
 inline T iceildiv(const T a, const T b) {
     return (a + b - 1) / b;
diff --git a/src/core/NEON/kernels/assembly/Helpers.cpp b/src/core/NEON/kernels/assembly/Helpers.cpp
deleted file mode 100644
index 93ea6c8..0000000
--- a/src/core/NEON/kernels/assembly/Helpers.cpp
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/assembly/Helpers.h"
-
-namespace arm_compute
-{
-arm_gemm::KernelDescription get_gemm_info(DataType                            input_type,
-                                          const CPUInfo                      &ci,
-                                          const unsigned int                  num_threads,
-                                          const INEGEMMWrapperKernel::Params &p,
-                                          arm_gemm::Activation                activation,
-                                          bool                                pretranspose_hint)
-{
-    switch(input_type)
-    {
-#ifdef __aarch64__
-        case DataType::QASYMM8:
-        case DataType::U8:
-        {
-            arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, activation, num_threads, pretranspose_hint);
-            return arm_gemm::get_gemm_method<uint8_t, uint32_t>(args);
-        }
-        case DataType::S8:
-        {
-            arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, activation, num_threads, pretranspose_hint);
-            return arm_gemm::get_gemm_method<int8_t, int32_t>(args);
-        }
-#endif // __aarch64__
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::F16:
-        {
-            arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, activation, num_threads, pretranspose_hint);
-            return arm_gemm::get_gemm_method<__fp16, __fp16>(args);
-        }
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-        case DataType::F32:
-        {
-            arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, activation, num_threads, pretranspose_hint);
-            return arm_gemm::get_gemm_method<float, float>(args);
-        }
-        default:
-            return arm_gemm::KernelDescription();
-    }
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp b/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp
index d00f204..b071be3 100644
--- a/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp
+++ b/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h b/src/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h
similarity index 94%
rename from arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h
rename to src/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h
index 0e3dd74..4af82f8 100644
--- a/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h
+++ b/src/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,10 +24,10 @@
 #ifndef ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H
 #define ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H
 
-#include "arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp"
 #include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_gemm_compute_iface.hpp"
 
 #include "gemm_common.hpp"
 
@@ -67,15 +67,14 @@
         return _name.c_str();
     }
 
-
     void run(const Window &window, const ThreadInfo &info) override
     {
         ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel)));
         ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
 
-        auto win=arm_gemm::to_ndcoord(window);
+        auto win = arm_gemm::to_ndcoord(window);
 
-        arm_gemm::ndcoord_t thread_locator { };
+        arm_gemm::ndcoord_t thread_locator{};
 
         _kernel->execute(win, thread_locator, info.thread_id);
     }
@@ -101,7 +100,7 @@
     void configure(arm_gemm::GemmCommon<TypeInput, TypeOutput> *kernel, std::string kernel_name_tag)
     {
         ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(kernel)));
-        _kernel         = kernel;
+        _kernel = kernel;
 
         Window win = to_window(kernel->get_window_size());
 
diff --git a/src/core/NEON/kernels/assembly/arm_gemm.hpp b/src/core/NEON/kernels/assembly/arm_gemm.hpp
new file mode 100644
index 0000000..58db511
--- /dev/null
+++ b/src/core/NEON/kernels/assembly/arm_gemm.hpp
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <cstring>
+#include <memory>
+
+#include "arm_gemm_local.hpp"
+#include "gemm_common.hpp"
+
+namespace arm_gemm
+{
+enum class GemmMethod
+{
+    DEFAULT,
+    GEMV_BATCHED,
+    GEMV_PRETRANSPOSED,
+    GEMV_NATIVE_TRANSPOSED,
+    GEMM_NATIVE,
+    GEMM_HYBRID,
+    GEMM_INTERLEAVED,
+    GEMM_INTERLEAVED_2D,
+    QUANTIZE_WRAPPER,
+    QUANTIZE_WRAPPER_2D,
+    GEMM_HYBRID_QUANTIZED
+};
+
+struct KernelDescription
+{
+    GemmMethod  method         = GemmMethod::DEFAULT;
+    std::string name           = "";
+    bool        is_default     = false;
+    uint64_t    cycle_estimate = 0;
+
+    KernelDescription(GemmMethod m, std::string n, bool d = false, uint64_t c = 0)
+        : method(m), name(n), is_default(d), cycle_estimate(c)
+    {
+    }
+    KernelDescription() noexcept
+    {
+    }
+};
+
+struct GemmConfig
+{
+    GemmMethod   method           = GemmMethod::DEFAULT;
+    std::string  filter           = "";
+    unsigned int inner_block_size = 0;
+    unsigned int outer_block_size = 0;
+
+    GemmConfig(GemmMethod method)
+        : method(method)
+    {
+    }
+    GemmConfig()
+    {
+    }
+};
+
+struct Activation
+{
+    enum class Type
+    {
+        None,
+        ReLU,
+        BoundedReLU
+    };
+
+    Type  type;
+    float param1;
+    float param2;
+
+    Activation(Type type = Type::None, float p1 = 0.0f, float p2 = 0.0f)
+        : type(type), param1(p1), param2(p2)
+    {
+    }
+};
+
+struct GemmArgs
+{
+public:
+    const CPUInfo    *_ci;
+    unsigned int      _Msize;
+    unsigned int      _Nsize;
+    unsigned int      _Ksize;
+    unsigned int      _nbatches;
+    unsigned int      _nmulti;
+    Activation        _act;
+    int               _maxthreads;
+    const GemmConfig *_cfg;
+
+    GemmArgs(const CPUInfo *ci, const unsigned int M, const unsigned int N,
+             const unsigned int K, const unsigned int nbatches,
+             const unsigned int nmulti, Activation act, const int maxthreads,
+             const GemmConfig *cfg = nullptr)
+        : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmulti(nmulti), _act(act), _maxthreads(maxthreads), _cfg(cfg)
+    {
+    }
+};
+
+struct Requantize32
+{
+public:
+    const int32_t *bias                = nullptr;
+    size_t         bias_multi_stride   = 0;
+    int32_t        a_offset            = 0;
+    int32_t        b_offset            = 0;
+    int32_t        c_offset            = 0;
+    bool           per_channel_requant = false;
+    int32_t        per_layer_shift     = 0;
+    int32_t        per_layer_mul       = 0;
+    const int32_t *per_channel_shifts  = nullptr;
+    const int32_t *per_channel_muls    = nullptr;
+    int32_t        minval              = 0;
+    int32_t        maxval              = 0;
+
+    Requantize32() = default;
+
+    // Constructor for per-tensor quantization
+    Requantize32(const int32_t *bias, size_t bias_multi_stride,
+                 int32_t a_offset, int32_t b_offset, int32_t c_offset,
+                 int32_t requant_shift, int32_t requant_mul,
+                 int32_t minv, int32_t maxv)
+        : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(false), per_layer_shift(requant_shift), per_layer_mul(requant_mul),
+          minval(minv), maxval(maxv)
+    {
+    }
+
+    // Constructor for per-channel quantization
+    Requantize32(const int32_t *bias, size_t bias_multi_stride,
+                 int32_t a_offset, int32_t b_offset, int32_t c_offset,
+                 const int32_t *requant_shifts, const int32_t *requant_muls,
+                 int32_t minv, int32_t maxv)
+        : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(true), per_channel_shifts(requant_shifts),
+          per_channel_muls(requant_muls), minval(minv), maxval(maxv)
+    {
+    }
+};
+
+struct Nothing
+{
+};
+
+template <typename Top, typename Tret>
+using UniqueGemmCommon = std::unique_ptr<GemmCommon<Top, Tret>>;
+
+/* Low level API calls.
+ * These are implemented as 'GemmArgs' versions, or with the arguments explicitly listed. */
+
+/* get_gemm_method(): Given the templated types and provided parameters,
+ * which is the preferred method to implement this GEMM?  */
+template <typename Top, typename Tret, class OutputStage = Nothing>
+KernelDescription get_gemm_method(const GemmArgs &args, const OutputStage & = {});
+
+template <typename Top, typename Tret, class OutputStage = Nothing>
+UniqueGemmCommon<Top, Tret> gemm(const GemmArgs &args, const OutputStage & = {});
+
+template <typename Top, typename Tret, class OutputStage = Nothing>
+std::vector<KernelDescription> get_compatible_kernels(const GemmArgs &args, const OutputStage & = {});
+
+} // namespace arm_gemm
diff --git a/arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp b/src/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp
similarity index 97%
rename from arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp
rename to src/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp
index 6f345c1..d620477 100644
--- a/arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp
+++ b/src/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,7 +25,8 @@
 
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/Dimensions.h"
-#include "arm_compute/core/NEON/kernels/arm_gemm/ndrange.hpp"
+
+#include "ndrange.hpp"
 
 #include <cassert>
 
diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp b/src/core/NEON/kernels/assembly/gemm_common.hpp
similarity index 70%
rename from arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
rename to src/core/NEON/kernels/assembly/gemm_common.hpp
index ea9b524..e9e56842 100644
--- a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
+++ b/src/core/NEON/kernels/assembly/gemm_common.hpp
@@ -23,15 +23,12 @@
  */
 #pragma once
 
-#include "arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp"
+#include "ndrange.hpp"
 
 #include <cstddef>
-#include <cassert>
 
-#define UNUSED(x)   (void)(x)
-
-namespace arm_gemm {
-
+namespace arm_gemm
+{
 // Abstract class for the GEMM/GEMV functions.
 //
 // GEMM implementations may be "native" (never require any input
@@ -41,7 +38,8 @@
 
 // The real GemmCommon class is templated based on the operand and return
 // type.  This is an interface class which is independent of those types.
-class IGemmCommon {
+class IGemmCommon
+{
 public:
     /* Pass in the pointers to the arrays to be operated on and their
      * strides.  This "generic" version uses void *s, the preferred version
@@ -50,9 +48,9 @@
      * the settings for B here are ignored.
      */
     virtual void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride,
-                                    const void *B, const int ldb, /* batches share B */     const int B_multi_stride,
-                                          void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
-                                    const void *bias, /* no row or batch stride needed */   const int bias_multi_stride) = 0;
+                                    const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
+                                    void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
+                                    const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) = 0;
 
     /** @returns an ndrange containing ranges of the compute space which can be
      * broken up and parallelised over
@@ -71,47 +69,64 @@
      * This has an empty default implementation, as GEMMs which don't care
      * about thread count can safely ignore this.
      */
-    virtual void set_nthreads(int) { };
+    virtual void set_nthreads(int) {};
 
     /* Whether this GEMM can be dynamically scheduled or not. */
-    virtual bool supports_dynamic_scheduling() const { return false; }
+    virtual bool supports_dynamic_scheduling() const
+    {
+        return false;
+    }
 
-    /** Main execute member fucntion
+    /** Main execute member function
      * @param [in] work_range     specifies the range of work we want to be computed, total range defined by get_window_size()
      * @param [in] thread_locator where are we inside of the thread space
      * @naram [in] threadid       a unique threadid
      */
-    virtual void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) = 0;
+    virtual void execute(const ndcoord_t &work_range, const ndcoord_t &thread_locator, int threadid) = 0;
 
     /*** Working space interface (optional) ***/
     /* Total number of bytes of temporary working space needed.  If zero, it's not necessary to call set_working_space(). */
-    virtual size_t get_working_size() const { return 0; }
+    virtual size_t get_working_size() const
+    {
+        return 0;
+    }
     /* Provide working space buffer - the void * passed in must remain allocated for the duration of any execute calls. */
-    virtual void set_working_space(void *) { };
+    virtual void set_working_space(void *) {};
 
     /*** "Pretransposed" interface (optional) ***/
     /* Is this object set up for pretranspose?  If so, pretranspose_array() needs to be called before execute(); */
-    virtual bool B_is_pretransposed() const { return false; }
+    virtual bool B_is_pretransposed() const
+    {
+        return false;
+    }
     /* Does pretranspose still need to be done? */
-    virtual bool B_pretranspose_required() const { return false; }
+    virtual bool B_pretranspose_required() const
+    {
+        return false;
+    }
     /* Total number of bytes of space needed for pretransposed arrays. */
-    virtual size_t get_B_pretransposed_array_size() const { return 0; }
+    virtual size_t get_B_pretransposed_array_size() const
+    {
+        return 0;
+    }
     /* Perform pretranspose - arguments are output, input, input row stride and input multi stride. */
     /* The "real" version of this depends on the templated operand type (see below).  */
     virtual void pretranspose_B_array_generic(void *, const void *, const int, const int) = 0;
     /* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. */
-    virtual void set_pretransposed_B_data(void *) { }
+    virtual void set_pretransposed_B_data(void *)
+    {
+    }
 
     /*** "Quantized bias" interface (optional) ***/
     /* Set the bias vector for quantized GEMMs */
-    virtual void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride)
+    virtual void set_quantized_bias(const int32_t *, size_t)
     {
-        UNUSED(bias);
-        UNUSED(bias_multi_stride);
     }
 
     // Destructor
-    virtual ~IGemmCommon() { }
+    virtual ~IGemmCommon()
+    {
+    }
 };
 
 /* "Real" GemmCommon class which is templated on the operand and return types.
@@ -121,50 +136,53 @@
  * 'set_arrays' to capture the provided arguments in protected class
  * members, as essentially any implementation will need these.
  */
-template<typename To, typename Tr>
-class GemmCommon : public IGemmCommon {
+template <typename To, typename Tr>
+class GemmCommon : public IGemmCommon
+{
 protected:
-    const To *_Aptr=nullptr;
-    int _lda=0;
-    int _A_batch_stride=0;
-    int _A_multi_stride=0;
-    const To *_Bptr=nullptr;
-    int _ldb=0;
-    int _B_multi_stride=0;
-    Tr *_Cptr=nullptr;
-    int _ldc=0;
-    int _C_batch_stride=0;
-    int _C_multi_stride=0;
-    const Tr *_bias=nullptr;
-    int _bias_multi_stride=0;
+    const To *_Aptr              = nullptr;
+    int       _lda               = 0;
+    int       _A_batch_stride    = 0;
+    int       _A_multi_stride    = 0;
+    const To *_Bptr              = nullptr;
+    int       _ldb               = 0;
+    int       _B_multi_stride    = 0;
+    Tr       *_Cptr              = nullptr;
+    int       _ldc               = 0;
+    int       _C_batch_stride    = 0;
+    int       _C_multi_stride    = 0;
+    const Tr *_bias              = nullptr;
+    int       _bias_multi_stride = 0;
 
 public:
     /* Pass in the pointers to the arrays to be operated on and their
      * strides (templated version with appropriate types). */
     virtual void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride,
-                            const To *B, const int ldb, /* batches share B */     const int B_multi_stride,
-                                  Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
-                            const Tr *bias, /* no row or batch stride needed */   const int bias_multi_stride) {
-        _Aptr = A;
-        _lda = lda;
-        _A_batch_stride = A_batch_stride;
-        _A_multi_stride = A_multi_stride;
-        _Bptr = B;
-        _ldb = ldb;
-        _B_multi_stride = B_multi_stride;
-        _Cptr = C;
-        _ldc = ldc;
-        _C_batch_stride = C_batch_stride;
-        _C_multi_stride = C_multi_stride;
-        _bias = bias;
+                            const To *B, const int ldb, /* batches share B */ const int B_multi_stride,
+                            Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
+                            const Tr *bias, /* no row or batch stride needed */ const int bias_multi_stride)
+    {
+        _Aptr              = A;
+        _lda               = lda;
+        _A_batch_stride    = A_batch_stride;
+        _A_multi_stride    = A_multi_stride;
+        _Bptr              = B;
+        _ldb               = ldb;
+        _B_multi_stride    = B_multi_stride;
+        _Cptr              = C;
+        _ldc               = ldc;
+        _C_batch_stride    = C_batch_stride;
+        _C_multi_stride    = C_multi_stride;
+        _bias              = bias;
         _bias_multi_stride = bias_multi_stride;
     }
 
     /* Implementation of the void * overload which casts its arguments to the appropriate type. */
     void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride,
-                            const void *B, const int ldb, /* batches share B */     const int B_multi_stride,
-                                  void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
-                            const void *bias, /* no row or batch stride needed */   const int bias_multi_stride) override {
+                            const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
+                            void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
+                            const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) override
+    {
         set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride,
                    static_cast<const To *>(B), ldb, B_multi_stride,
                    static_cast<Tr *>(C), ldc, C_batch_stride, C_multi_stride,
@@ -175,27 +193,13 @@
 
     /* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */
     /* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */
-    virtual void pretranspose_B_array(void *, const To *, const int, const int) { };
+    virtual void pretranspose_B_array(void *, const To *, const int, const int) {};
 
     /* Implementation of the void * overload which casts its arguments to the appropriate type. */
-    void pretranspose_B_array_generic(void *out, const void *in, const int row_stride, const int multi_stride) override {
+    void pretranspose_B_array_generic(void *out, const void *in, const int row_stride, const int multi_stride) override
+    {
         pretranspose_B_array(out, static_cast<const To *>(in), row_stride, multi_stride);
     }
 };
 
-template<typename GemmKernel>
-inline
-int unsigned get_total_window_size(const GemmKernel& kernel)
-{
-    auto window=kernel.get_window_size();
-
-    unsigned int total = 1;
-    for(unsigned i = 0; i != arm_gemm::ndrange_max; ++i)
-    {
-        total *= window.get_size(i);
-    }
-
-    return total;
-}
-
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/assembly/ndrange.hpp b/src/core/NEON/kernels/assembly/ndrange.hpp
new file mode 100644
index 0000000..a2bb60f
--- /dev/null
+++ b/src/core/NEON/kernels/assembly/ndrange.hpp
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <initializer_list>
+
+namespace arm_gemm
+{
+template <unsigned int D>
+class NDRange
+{
+private:
+    std::array<unsigned int, D> m_sizes{};
+    std::array<unsigned int, D> m_totalsizes{};
+
+    class NDRangeIterator
+    {
+    private:
+        const NDRange &m_parent;
+        unsigned int   m_pos = 0;
+        unsigned int   m_end = 0;
+
+    public:
+        NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e)
+            : m_parent(p), m_pos(s), m_end(e)
+        {
+        }
+
+        bool done() const
+        {
+            return (m_pos >= m_end);
+        }
+
+        unsigned int dim(unsigned int d) const
+        {
+            unsigned int r = m_pos;
+
+            if(d < (D - 1))
+            {
+                r %= m_parent.m_totalsizes[d];
+            }
+
+            if(d > 0)
+            {
+                r /= m_parent.m_totalsizes[d - 1];
+            }
+
+            return r;
+        }
+
+        bool next_dim0()
+        {
+            m_pos++;
+
+            return !done();
+        }
+
+        bool next_dim1()
+        {
+            m_pos += m_parent.m_sizes[0] - dim(0);
+
+            return !done();
+        }
+
+        unsigned int dim0_max() const
+        {
+            unsigned int offset = std::min(m_end - m_pos, m_parent.m_sizes[0] - dim(0));
+
+            return dim(0) + offset;
+        }
+    };
+
+    void set_totalsizes()
+    {
+        unsigned int t = 1;
+
+        for(unsigned int i = 0; i < D; i++)
+        {
+            if(m_sizes[i] == 0)
+            {
+                m_sizes[i] = 1;
+            }
+
+            t *= m_sizes[i];
+
+            m_totalsizes[i] = t;
+        }
+    }
+
+public:
+    NDRange &operator=(const NDRange &rhs) = default;
+    NDRange(const NDRange &rhs)            = default;
+
+    template <typename... T>
+    NDRange(T... ts)
+        : m_sizes{ ts... }
+    {
+        set_totalsizes();
+    }
+
+    NDRange(const std::array<unsigned int, D> &n)
+        : m_sizes(n)
+    {
+        set_totalsizes();
+    }
+
+    NDRangeIterator iterator(unsigned int start, unsigned int end) const
+    {
+        return NDRangeIterator(*this, start, end);
+    }
+
+    unsigned int total_size() const
+    {
+        return m_totalsizes[D - 1];
+    }
+
+    unsigned int get_size(unsigned int v) const
+    {
+        return m_sizes[v];
+    }
+};
+
+/** NDCoordinate builds upon a range, but specifies a starting position
+ * in addition to a size which it inherits from NDRange
+ */
+template <unsigned int N>
+class NDCoordinate : public NDRange<N>
+{
+    using int_t     = unsigned int;
+    using ndrange_t = NDRange<N>;
+
+    std::array<int_t, N> m_positions{};
+
+public:
+    NDCoordinate &operator=(const NDCoordinate &rhs) = default;
+    NDCoordinate(const NDCoordinate &rhs)            = default;
+    NDCoordinate(const std::initializer_list<std::pair<int_t, int_t>> &list)
+    {
+        std::array<int_t, N> sizes{};
+
+        std::size_t i = 0;
+        for(auto &p : list)
+        {
+            m_positions[i] = p.first;
+            sizes[i++]     = p.second;
+        }
+
+        //update the parents sizes
+        static_cast<ndrange_t &>(*this) = ndrange_t(sizes);
+    }
+
+    int_t get_position(int_t d) const
+    {
+        assert(d < N);
+
+        return m_positions[d];
+    }
+
+    void set_position(int_t d, int_t v)
+    {
+        assert(d < N);
+
+        m_positions[d] = v;
+    }
+
+    int_t get_position_end(int_t d) const
+    {
+        return get_position(d) + ndrange_t::get_size(d);
+    }
+}; //class NDCoordinate
+
+using ndrange_t = NDRange<6>;
+using ndcoord_t = NDCoordinate<6>;
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/convolution/common/padding.cpp b/src/core/NEON/kernels/convolution/common/padding.cpp
index 88b37b8..f57706f 100644
--- a/src/core/NEON/kernels/convolution/common/padding.cpp
+++ b/src/core/NEON/kernels/convolution/common/padding.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/common/qasymm8.cpp b/src/core/NEON/kernels/convolution/common/qasymm8.cpp
index 64e3156..11e33b5 100644
--- a/src/core/NEON/kernels/convolution/common/qasymm8.cpp
+++ b/src/core/NEON/kernels/convolution/common/qasymm8.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/common/qsymm8.cpp b/src/core/NEON/kernels/convolution/common/qsymm8.cpp
index e50263a..74ee249 100644
--- a/src/core/NEON/kernels/convolution/common/qsymm8.cpp
+++ b/src/core/NEON/kernels/convolution/common/qsymm8.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/common/utils.cpp b/src/core/NEON/kernels/convolution/common/utils.cpp
index 45847bb..496a14c 100644
--- a/src/core/NEON/kernels/convolution/common/utils.cpp
+++ b/src/core/NEON/kernels/convolution/common/utils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
index 1272754..864c6e2 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
index 010dd81..2554436 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
index 4661373..2142c43 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
index eb2b37a..b798b8c 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
index d95332b..89d1f22 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp
index 6481055..27bfb84 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp
index 3d6777b..e56583d 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_fp16.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_fp16.cpp
index ffe7844..99f0f53 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_fp16.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_fp16.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_fp32.cpp
index 331e158..c13dd70 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_fp32.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp
index 21ea350..bddae51 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp
index f683c6e..b09f620 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp
index 88d8e9f..1ae48b9 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_base.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_base.hpp
index 22231cf..266d13d 100644
--- a/src/core/NEON/kernels/convolution/depthwise/impl_base.hpp
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_base.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp
index ac75cc9..4130188 100644
--- a/src/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
index 87d2bfd..a00a1ef 100644
--- a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
index e19e4c6..b0d8126 100644
--- a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
index 81eb7b3..e8b4c7b 100644
--- a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp
index b27430c..68e20d9 100644
--- a/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/winograd/padding.cpp b/src/core/NEON/kernels/convolution/winograd/padding.cpp
index 04aa472..1d44c38 100644
--- a/src/core/NEON/kernels/convolution/winograd/padding.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/padding.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd.cpp b/src/core/NEON/kernels/convolution/winograd/winograd.cpp
index 867bb3c..d556112 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/arm_compute/core/NEON/kernels/convolution/winograd/winograd.hpp b/src/core/NEON/kernels/convolution/winograd/winograd.hpp
similarity index 99%
rename from arm_compute/core/NEON/kernels/convolution/winograd/winograd.hpp
rename to src/core/NEON/kernels/convolution/winograd/winograd.hpp
index bc0d9d4..ac82e7b 100644
--- a/arm_compute/core/NEON/kernels/convolution/winograd/winograd.hpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,7 +24,7 @@
 
 #pragma once
 
-#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp"
+#include "arm_gemm.hpp"
 
 #include <cstddef>
 #include <utility>
diff --git a/arm_compute/core/NEON/kernels/convolution/winograd/winograd_layer.hpp b/src/core/NEON/kernels/convolution/winograd/winograd_layer.hpp
similarity index 99%
rename from arm_compute/core/NEON/kernels/convolution/winograd/winograd_layer.hpp
rename to src/core/NEON/kernels/convolution/winograd/winograd_layer.hpp
index ed8fede..3cfb6e6 100644
--- a/arm_compute/core/NEON/kernels/convolution/winograd/winograd_layer.hpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_layer.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input.hpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input.hpp
index 8e4bebc..c0f50be 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input.hpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_1x8_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_1x8_fp32_fp32_integers.cpp
index 5040ec1..8f6e9e8 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_1x8_fp32_fp32_integers.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_1x8_fp32_fp32_integers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp16_fp16_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp16_fp16_integers.cpp
index 1ea68b5..5e6ac97 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp16_fp16_integers.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp16_fp16_integers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp32_fp32_integers.cpp
index 9393785..69d3e8f 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp32_fp32_integers.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp32_fp32_integers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp16_fp16_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp16_fp16_integers.cpp
index 3eaf977..d0ce307 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp16_fp16_integers.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp16_fp16_integers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp
index e4aad76..0095e6c 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/kernel.hpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/kernel.hpp
index e45f186..27d2081 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/kernel.hpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/kernel.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output.hpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output.hpp
index ed88098..c1fb559 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output.hpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2_7_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2_7_fp32_fp32_integers.cpp
index f231bdd..8e25790 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2_7_fp32_fp32_integers.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2_7_fp32_fp32_integers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_3x3_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_3x3_fp32_fp32_integers.cpp
index 5136bc1..8b0b470 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_3x3_fp32_fp32_integers.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_3x3_fp32_fp32_integers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_5x5_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_5x5_fp32_fp32_integers.cpp
index 0f911f1..3996be1 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_5x5_fp32_fp32_integers.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_5x5_fp32_fp32_integers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4_5_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4_5_fp32_fp32_integers.cpp
index 49a3f41..c35037e 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4_5_fp32_fp32_integers.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4_5_fp32_fp32_integers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp16_fp16_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp16_fp16_integers.cpp
index 37b890d..3c071bd 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp16_fp16_integers.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp16_fp16_integers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp32_fp32_integers.cpp
index 292999c..1eb9b53 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp32_fp32_integers.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp32_fp32_integers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_6_3_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_6_3_fp32_fp32_integers.cpp
index 05f06a8..528cd8c 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_6_3_fp32_fp32_integers.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_6_3_fp32_fp32_integers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2_7_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2_7_fp32_fp32_integers.cpp
index 37ae43f..2ee377c 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2_7_fp32_fp32_integers.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2_7_fp32_fp32_integers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_3x3_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_3x3_fp32_fp32_integers.cpp
index 8fab6db..3fde4a7 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_3x3_fp32_fp32_integers.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_3x3_fp32_fp32_integers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_5x5_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_5x5_fp32_fp32_integers.cpp
index 79f4fa3..26ab56f 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_5x5_fp32_fp32_integers.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_5x5_fp32_fp32_integers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4_5_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4_5_fp32_fp32_integers.cpp
index fb3d712..eeda274 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4_5_fp32_fp32_integers.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4_5_fp32_fp32_integers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp16_fp16_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp16_fp16_integers.cpp
index 3c4f8b4..3101865 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp16_fp16_integers.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp16_fp16_integers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp32_fp32_integers.cpp
index 9e7040b..7c2c718 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp32_fp32_integers.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp32_fp32_integers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp
index 4572348..9b42224 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/PyramidInfo.cpp b/src/core/PyramidInfo.cpp
index 1c12eee..7a8fa0b 100644
--- a/src/core/PyramidInfo.cpp
+++ b/src/core/PyramidInfo.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/Rounding.cpp b/src/core/Rounding.cpp
index da6e5f6..99858e2 100644
--- a/src/core/Rounding.cpp
+++ b/src/core/Rounding.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/Size2D.cpp b/src/core/Size2D.cpp
index 8aa801b..6eb46e5 100644
--- a/src/core/Size2D.cpp
+++ b/src/core/Size2D.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/SubTensorInfo.cpp b/src/core/SubTensorInfo.cpp
index be8560f..bb8ecf6 100644
--- a/src/core/SubTensorInfo.cpp
+++ b/src/core/SubTensorInfo.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,12 +41,6 @@
  */
 TensorShape extend_parent_shape(TensorShape parent_shape, TensorShape shape, Coordinates coords)
 {
-    // Subtensor should not index in x, y dimensions.
-    ARM_COMPUTE_ERROR_ON((coords.x() != 0) || (coords.y() != 0));
-
-    // Cannot extend on x, y ?
-    ARM_COMPUTE_ERROR_ON((parent_shape.total_size() != 0) && (parent_shape.x() != shape.x()) && (parent_shape.y() != shape.y()));
-
     // Extend shape
     for(unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i)
     {
@@ -70,6 +64,7 @@
     : _parent(parent), _tensor_shape(tensor_shape), _coords(coords), _valid_region{ Coordinates(), _tensor_shape }, _extend_parent(extend_parent)
 {
     ARM_COMPUTE_ERROR_ON(parent == nullptr);
+
     // Check if subtensor is valid if parent is configured
     if(parent->tensor_shape().total_size() != 0 && !_extend_parent)
     {
@@ -118,6 +113,17 @@
     ARM_COMPUTE_ERROR_ON(!_parent->is_resizable());
     ARM_COMPUTE_ERROR_ON(_parent->total_size() == 0);
 
+    // Check that you do not extend padding on sub-tensors unless XY shape matches parent tensor
+    // TODO(COMPMID-3558): Remove _extend_parent check
+    if(!_extend_parent && (padding.left || padding.right))
+    {
+        ARM_COMPUTE_ERROR_ON(_parent->tensor_shape().x() != tensor_shape().x());
+    }
+    if(!_extend_parent && (padding.top || padding.bottom))
+    {
+        ARM_COMPUTE_ERROR_ON(_parent->tensor_shape().y() != tensor_shape().y());
+    }
+
     // Extend parent padding if required
     return _parent->extend_padding(padding);
 }
diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp
index b86a4cf..0971d2a 100644
--- a/src/core/TensorInfo.cpp
+++ b/src/core/TensorInfo.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/TracePoint.cpp b/src/core/TracePoint.cpp
index b5c1818..06d9527 100644
--- a/src/core/TracePoint.cpp
+++ b/src/core/TracePoint.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index bdde082..cec7a1b 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -456,18 +456,6 @@
     return QuantizationInfo(1.f / 256, 0);
 }
 
-float arm_compute::calculate_resize_ratio(size_t input_size, size_t output_size, bool align_corners)
-{
-    const size_t offset = align_corners ? 1 : 0;
-    const auto   in     = input_size - offset;
-    const auto   out    = output_size - offset;
-
-    ARM_COMPUTE_ERROR_ON((input_size == 0 || output_size == 0) && offset == 1);
-    ARM_COMPUTE_ERROR_ON(out == 0);
-
-    return static_cast<float>(in) / static_cast<float>(out);
-}
-
 std::pair<int32_t, int32_t> arm_compute::get_quantized_activation_min_max(ActivationLayerInfo act_info, DataType data_type, UniformQuantizationInfo oq_info)
 {
     const bool is_qasymm8_signed = is_data_type_quantized_asymmetric_signed(data_type);
diff --git a/src/core/Validate.cpp b/src/core/Validate.cpp
index f9bd6d6..bd5e494 100644
--- a/src/core/Validate.cpp
+++ b/src/core/Validate.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -176,16 +176,12 @@
 arm_compute::Status arm_compute::error_on_invalid_subtensor(const char *function, const char *file, const int line,
                                                             const TensorShape &parent_shape, const Coordinates &coords, const TensorShape &shape)
 {
-    // Subtensor should not index in x, y dimensions.
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC(((coords.x() != 0) || (coords.y() != 0)), function, file, line);
-    // Subtensor shape should match parent tensor in x, y dimensions.
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC(((parent_shape.x() != shape.x()) || (parent_shape.y() != shape.y())), function, file, line);
-
     // Check dimensions
     for(unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_LOC(((coords[i] >= static_cast<int>(parent_shape[i])) || (coords[i] + static_cast<int>(shape[i]) > static_cast<int>(parent_shape[i]))),
-                                        function, file, line);
+        const bool invalid_idx        = coords[i] >= static_cast<int>(parent_shape[i]);
+        const bool out_of_bounds_size = coords[i] + static_cast<int>(shape[i]) > static_cast<int>(parent_shape[i]);
+        ARM_COMPUTE_RETURN_ERROR_ON_LOC(invalid_idx || out_of_bounds_size, function, file, line);
     }
     return arm_compute::Status{};
 }
diff --git a/src/core/Version.cpp b/src/core/Version.cpp
index fbbb6d8..cde7a18 100644
--- a/src/core/Version.cpp
+++ b/src/core/Version.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp b/src/core/utils/ScaleUtils.cpp
similarity index 60%
copy from src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
copy to src/core/utils/ScaleUtils.cpp
index 36f84d8..d46ca0e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
+++ b/src/core/utils/ScaleUtils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,37 +21,17 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#pragma once
+#include "src/core/utils/ScaleUtils.h"
+#include "arm_compute/core/Helpers.h"
 
-#ifdef __aarch64__
+float arm_compute::scale_utils::calculate_resize_ratio(size_t input_size, size_t output_size, bool align_corners)
+{
+    const size_t offset = (align_corners && output_size > 1) ? 1 : 0;
+    const auto   in     = input_size - offset;
+    const auto   out    = output_size - offset;
 
-namespace arm_gemm {
+    ARM_COMPUTE_ERROR_ON((input_size == 0 || output_size == 0) && offset == 1);
+    ARM_COMPUTE_ERROR_ON(out == 0);
 
-// Actual kernel implementations
-void a64_sgemv_trans(const float *, const float *, float *, float, int, int, int);
-
-// Transposed SGEMV strategy class.
-class sgemv_trans {
-public:
-    typedef float operand_type;
-    typedef float result_type;
-
-    typedef void (*kern_type)(const float *, const float *, float *, float, int, int, int);
-
-    /* Kernel blocking parameters */
-    static unsigned int out_width() {
-        return 96;
-    }
-
-    static unsigned int k_unroll() {
-        return 1;
-    }
-
-    kern_type kernel=a64_sgemv_trans;
-
-    sgemv_trans(const CPUInfo *ci) { UNUSED(ci); }
-};
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
+    return static_cast<float>(in) / static_cast<float>(out);
+}
\ No newline at end of file
diff --git a/src/core/utils/ScaleUtils.h b/src/core/utils/ScaleUtils.h
new file mode 100644
index 0000000..3cc986b
--- /dev/null
+++ b/src/core/utils/ScaleUtils.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef UTILS_CORE_SCALEUTILS_H
+#define UTILS_CORE_SCALEUTILS_H
+
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+#include <cstdlib>
+
+namespace arm_compute
+{
+namespace scale_utils
+{
+/** Returns resize ratio between input and output with consideration of aligned corners
+ *
+ * @param[in] input_size    The input size
+ * @param[in] output_size   the output size
+ * @param[in] align_corners True to align corners of input and output. Defaults to false.
+ *
+ * @return The ratio between input and output (i.e., the input size divided by the output size)
+ */
+float calculate_resize_ratio(size_t input_size, size_t output_size, bool align_corners = false);
+
+/** Returns if aligned corners are allowed for the given sampling policy
+ *
+ * @param[in] sampling_policy The sampling policy to consider
+ *
+ * @return True if aligned corners are allowed
+ */
+inline bool is_align_corners_allowed_sampling_policy(SamplingPolicy sampling_policy)
+{
+    return sampling_policy != SamplingPolicy::CENTER;
+}
+} // namespace scale_utils
+} // namespace arm_compute
+#endif /* UTILS_CORE_SCALEUTILS_H */
\ No newline at end of file
diff --git a/src/core/utils/helpers/fft.cpp b/src/core/utils/helpers/fft.cpp
index 7ff2fdf..4c2f8fa 100644
--- a/src/core/utils/helpers/fft.cpp
+++ b/src/core/utils/helpers/fft.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/utils/helpers/tensor_transform.cpp b/src/core/utils/helpers/tensor_transform.cpp
index cd874b2..84302ea 100644
--- a/src/core/utils/helpers/tensor_transform.cpp
+++ b/src/core/utils/helpers/tensor_transform.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/utils/io/FileHandler.cpp b/src/core/utils/io/FileHandler.cpp
index 2c67806..95fc2e3 100644
--- a/src/core/utils/io/FileHandler.cpp
+++ b/src/core/utils/io/FileHandler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/utils/logging/FilePrinter.cpp b/src/core/utils/logging/FilePrinter.cpp
index b699afc..55e78f9 100644
--- a/src/core/utils/logging/FilePrinter.cpp
+++ b/src/core/utils/logging/FilePrinter.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/utils/logging/Helpers.cpp b/src/core/utils/logging/Helpers.cpp
index f5ab608..c3df7f6 100644
--- a/src/core/utils/logging/Helpers.cpp
+++ b/src/core/utils/logging/Helpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/utils/logging/Logger.cpp b/src/core/utils/logging/Logger.cpp
index 7f954ac..05c5fa0 100644
--- a/src/core/utils/logging/Logger.cpp
+++ b/src/core/utils/logging/Logger.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/utils/logging/LoggerRegistry.cpp b/src/core/utils/logging/LoggerRegistry.cpp
index 4880e9e..c281d88 100644
--- a/src/core/utils/logging/LoggerRegistry.cpp
+++ b/src/core/utils/logging/LoggerRegistry.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/utils/misc/MMappedFile.cpp b/src/core/utils/misc/MMappedFile.cpp
index 6d0b0be..0b94141 100644
--- a/src/core/utils/misc/MMappedFile.cpp
+++ b/src/core/utils/misc/MMappedFile.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp
index 8e0e92c..49e39f6 100644
--- a/src/core/utils/quantization/AsymmHelpers.cpp
+++ b/src/core/utils/quantization/AsymmHelpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/Graph.cpp b/src/graph/Graph.cpp
index edc8d6b..ad6f200 100644
--- a/src/graph/Graph.cpp
+++ b/src/graph/Graph.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/GraphBuilder.cpp b/src/graph/GraphBuilder.cpp
index 218e6ce..72c45fa 100644
--- a/src/graph/GraphBuilder.cpp
+++ b/src/graph/GraphBuilder.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/GraphContext.cpp b/src/graph/GraphContext.cpp
index 4d97807..7b74c2f 100644
--- a/src/graph/GraphContext.cpp
+++ b/src/graph/GraphContext.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/GraphManager.cpp b/src/graph/GraphManager.cpp
index 996e50b..9d53172 100644
--- a/src/graph/GraphManager.cpp
+++ b/src/graph/GraphManager.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/INode.cpp b/src/graph/INode.cpp
index b0c3137..93f0854 100644
--- a/src/graph/INode.cpp
+++ b/src/graph/INode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/PassManager.cpp b/src/graph/PassManager.cpp
index 99d9798..f7e214c 100644
--- a/src/graph/PassManager.cpp
+++ b/src/graph/PassManager.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/Tensor.cpp b/src/graph/Tensor.cpp
index 205ef11..f69d49d 100644
--- a/src/graph/Tensor.cpp
+++ b/src/graph/Tensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/TypeLoader.cpp b/src/graph/TypeLoader.cpp
index 81a405b..a1b3fd8 100644
--- a/src/graph/TypeLoader.cpp
+++ b/src/graph/TypeLoader.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/Utils.cpp b/src/graph/Utils.cpp
index 7af1812..6489058 100644
--- a/src/graph/Utils.cpp
+++ b/src/graph/Utils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/Workload.cpp b/src/graph/Workload.cpp
index 0d2a405..7020503 100644
--- a/src/graph/Workload.cpp
+++ b/src/graph/Workload.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/algorithms/TopologicalSort.cpp b/src/graph/algorithms/TopologicalSort.cpp
index 0fbf6e3..3647e13 100644
--- a/src/graph/algorithms/TopologicalSort.cpp
+++ b/src/graph/algorithms/TopologicalSort.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/backends/BackendRegistry.cpp b/src/graph/backends/BackendRegistry.cpp
index dccfefc..46b4f99 100644
--- a/src/graph/backends/BackendRegistry.cpp
+++ b/src/graph/backends/BackendRegistry.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/backends/CL/CLDeviceBackend.cpp b/src/graph/backends/CL/CLDeviceBackend.cpp
index 0159592..b2d58e3 100644
--- a/src/graph/backends/CL/CLDeviceBackend.cpp
+++ b/src/graph/backends/CL/CLDeviceBackend.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/backends/CL/CLFunctionsFactory.cpp b/src/graph/backends/CL/CLFunctionsFactory.cpp
index 312e09a..d41da4b 100644
--- a/src/graph/backends/CL/CLFunctionsFactory.cpp
+++ b/src/graph/backends/CL/CLFunctionsFactory.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,6 +42,7 @@
 struct CLTargetInfo
 {
     using TensorType         = arm_compute::ICLTensor;
+    using SrcTensorType      = const arm_compute::ICLTensor;
     using TensorConcreteType = CLTensor;
     static Target TargetType;
 };
@@ -65,6 +66,12 @@
     using Multiplication = CLPixelWiseMultiplication;
 };
 
+/** Collection of CL unary element-wise functions */
+struct CLUnaryEltwiseFunctions
+{
+    using Exp = CLExpLayer;
+};
+
 /** Function and tensor types to be used inside a CL fused convolution/batch normalization layer */
 struct CLFusedLayerTypes
 {
@@ -252,6 +259,8 @@
             return detail::create_detection_post_process_layer<CPPDetectionPostProcessLayer, CLTargetInfo>(*polymorphic_downcast<DetectionPostProcessLayerNode *>(node));
         case NodeType::EltwiseLayer:
             return detail::create_eltwise_layer<CLEltwiseFunctions, CLTargetInfo>(*polymorphic_downcast<EltwiseLayerNode *>(node));
+        case NodeType::UnaryEltwiseLayer:
+            return detail::create_unary_eltwise_layer<CLUnaryEltwiseFunctions, CLTargetInfo>(*polymorphic_downcast<UnaryEltwiseLayerNode *>(node));
         case NodeType::FlattenLayer:
             return detail::create_flatten_layer<CLFlattenLayer, CLTargetInfo>(*polymorphic_downcast<FlattenLayerNode *>(node));
         case NodeType::FullyConnectedLayer:
diff --git a/src/graph/backends/CL/CLNodeValidator.cpp b/src/graph/backends/CL/CLNodeValidator.cpp
index ddb8e3d..cc618db 100644
--- a/src/graph/backends/CL/CLNodeValidator.cpp
+++ b/src/graph/backends/CL/CLNodeValidator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,6 +38,20 @@
 {
 namespace backends
 {
+/** Collection of CL element-wise functions */
+struct CLEltwiseLayerFunctions
+{
+    using ArithmeticAddition      = CLArithmeticAddition;
+    using ArithmeticSubtraction   = CLArithmeticSubtraction;
+    using PixelWiseMultiplication = CLPixelWiseMultiplication;
+};
+
+/** Collection of CL unary element-wise functions */
+struct CLUnaryEltwiseLayerFunctions
+{
+    using ExpLayer = CLExpLayer;
+};
+
 Status CLNodeValidator::validate(INode *node)
 {
     if(node == nullptr)
@@ -91,6 +105,10 @@
             return detail::validate_upsample_layer<CLUpsampleLayer>(*polymorphic_downcast<UpsampleLayerNode *>(node));
         case NodeType::YOLOLayer:
             return detail::validate_yolo_layer<CLYOLOLayer>(*polymorphic_downcast<YOLOLayerNode *>(node));
+        case NodeType::EltwiseLayer:
+            return detail::validate_eltwise_Layer<CLEltwiseLayerFunctions>(*polymorphic_downcast<EltwiseLayerNode *>(node));
+        case NodeType::UnaryEltwiseLayer:
+            return detail::validate_unary_eltwise_layer<CLUnaryEltwiseLayerFunctions>(*polymorphic_downcast<UnaryEltwiseLayerNode *>(node));
         default:
             return Status{};
     }
diff --git a/src/graph/backends/CL/CLSubTensorHandle.cpp b/src/graph/backends/CL/CLSubTensorHandle.cpp
index 016dca7..ada0d68 100644
--- a/src/graph/backends/CL/CLSubTensorHandle.cpp
+++ b/src/graph/backends/CL/CLSubTensorHandle.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/backends/CL/CLTensorHandle.cpp b/src/graph/backends/CL/CLTensorHandle.cpp
index 891c784..a496c2c 100644
--- a/src/graph/backends/CL/CLTensorHandle.cpp
+++ b/src/graph/backends/CL/CLTensorHandle.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/backends/GLES/GCDeviceBackend.cpp b/src/graph/backends/GLES/GCDeviceBackend.cpp
index bb674ce..252093c 100644
--- a/src/graph/backends/GLES/GCDeviceBackend.cpp
+++ b/src/graph/backends/GLES/GCDeviceBackend.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/backends/GLES/GCFunctionsFactory.cpp b/src/graph/backends/GLES/GCFunctionsFactory.cpp
index 075e7af..8ecb593 100644
--- a/src/graph/backends/GLES/GCFunctionsFactory.cpp
+++ b/src/graph/backends/GLES/GCFunctionsFactory.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -40,7 +40,8 @@
 /** Target specific information structure used to pass information to the layer templates */
 struct GCTargetInfo
 {
-    using TensorType = arm_compute::IGCTensor;
+    using TensorType    = arm_compute::IGCTensor;
+    using SrcTensorType = TensorType;
     static Target TargetType;
 };
 
diff --git a/src/graph/backends/GLES/GCNodeValidator.cpp b/src/graph/backends/GLES/GCNodeValidator.cpp
index 15a66f4..159e512 100644
--- a/src/graph/backends/GLES/GCNodeValidator.cpp
+++ b/src/graph/backends/GLES/GCNodeValidator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/backends/GLES/GCTensorHandle.cpp b/src/graph/backends/GLES/GCTensorHandle.cpp
index 8f59262..94e8813 100644
--- a/src/graph/backends/GLES/GCTensorHandle.cpp
+++ b/src/graph/backends/GLES/GCTensorHandle.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/backends/NEON/NEDeviceBackend.cpp b/src/graph/backends/NEON/NEDeviceBackend.cpp
index b568b79..adb87a9 100644
--- a/src/graph/backends/NEON/NEDeviceBackend.cpp
+++ b/src/graph/backends/NEON/NEDeviceBackend.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/backends/NEON/NEFunctionFactory.cpp b/src/graph/backends/NEON/NEFunctionFactory.cpp
index 454215e..4fee630 100644
--- a/src/graph/backends/NEON/NEFunctionFactory.cpp
+++ b/src/graph/backends/NEON/NEFunctionFactory.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,13 +47,14 @@
 struct NETargetInfo
 {
     using TensorType         = arm_compute::ITensor;
+    using SrcTensorType      = const arm_compute::ITensor;
     using TensorConcreteType = arm_compute::Tensor;
     static Target TargetType;
 };
 
 Target NETargetInfo::TargetType = Target::NEON;
 
-/** Collection of CL convolution functions */
+/** Collection of NEON convolution functions */
 struct NEConvolutionLayerFunctions
 {
     using GenericConvolutionLayer  = NEConvolutionLayer;
@@ -62,7 +63,7 @@
     using WinogradConvolutionLayer = NEWinogradConvolutionLayer;
 };
 
-/** Collection of CL element-wise functions */
+/** Collection of NEON element-wise functions */
 struct NEEltwiseFunctions
 {
     using Addition       = NEArithmeticAddition;
@@ -70,6 +71,12 @@
     using Multiplication = NEPixelWiseMultiplication;
 };
 
+/** Collection of NEON unary element-wise functions */
+struct NEUnaryEltwiseFunctions
+{
+    using Exp = NEExpLayer;
+};
+
 /** Function and tensor types to be used inside a NEON fused convolution/batch normalization layer */
 struct NEFusedLayerTypes
 {
@@ -143,6 +150,8 @@
             return detail::create_detection_post_process_layer<NEDetectionPostProcessLayer, NETargetInfo>(*polymorphic_downcast<DetectionPostProcessLayerNode *>(node));
         case NodeType::EltwiseLayer:
             return detail::create_eltwise_layer<NEEltwiseFunctions, NETargetInfo>(*polymorphic_downcast<EltwiseLayerNode *>(node));
+        case NodeType::UnaryEltwiseLayer:
+            return detail::create_unary_eltwise_layer<NEUnaryEltwiseFunctions, NETargetInfo>(*polymorphic_downcast<UnaryEltwiseLayerNode *>(node));
         case NodeType::FlattenLayer:
             return detail::create_flatten_layer<NEFlattenLayer, NETargetInfo>(*polymorphic_downcast<FlattenLayerNode *>(node));
         case NodeType::FullyConnectedLayer:
diff --git a/src/graph/backends/NEON/NENodeValidator.cpp b/src/graph/backends/NEON/NENodeValidator.cpp
index 0a31072..a5d22fb 100644
--- a/src/graph/backends/NEON/NENodeValidator.cpp
+++ b/src/graph/backends/NEON/NENodeValidator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -38,6 +38,20 @@
 {
 namespace backends
 {
+/** Collection of NEON element-wise functions */
+struct NEEltwiseLayerFunctions
+{
+    using ArithmeticAddition      = NEArithmeticAddition;
+    using ArithmeticSubtraction   = NEArithmeticSubtraction;
+    using PixelWiseMultiplication = NEPixelWiseMultiplication;
+};
+
+/** Collection of NEON unary element-wise functions */
+struct NEUnaryEltwiseLayerFunctions
+{
+    using ExpLayer = NEExpLayer;
+};
+
 Status NENodeValidator::validate(INode *node)
 {
     if(node == nullptr)
@@ -91,6 +105,10 @@
             return detail::validate_upsample_layer<NEUpsampleLayer>(*polymorphic_downcast<UpsampleLayerNode *>(node));
         case NodeType::YOLOLayer:
             return detail::validate_yolo_layer<NEYOLOLayer>(*polymorphic_downcast<YOLOLayerNode *>(node));
+        case NodeType::EltwiseLayer:
+            return detail::validate_eltwise_Layer<NEEltwiseLayerFunctions>(*polymorphic_downcast<EltwiseLayerNode *>(node));
+        case NodeType::UnaryEltwiseLayer:
+            return detail::validate_unary_eltwise_layer<NEUnaryEltwiseLayerFunctions>(*polymorphic_downcast<UnaryEltwiseLayerNode *>(node));
         default:
             return Status{};
     }
diff --git a/src/graph/backends/NEON/NESubTensorHandle.cpp b/src/graph/backends/NEON/NESubTensorHandle.cpp
index c0acedd..36f29d0 100644
--- a/src/graph/backends/NEON/NESubTensorHandle.cpp
+++ b/src/graph/backends/NEON/NESubTensorHandle.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/backends/NEON/NETensorHandle.cpp b/src/graph/backends/NEON/NETensorHandle.cpp
index d58c45b..c8fc3f1 100644
--- a/src/graph/backends/NEON/NETensorHandle.cpp
+++ b/src/graph/backends/NEON/NETensorHandle.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp b/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp
index d5192e9..fd16625 100644
--- a/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp
+++ b/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/detail/ExecutionHelpers.cpp b/src/graph/detail/ExecutionHelpers.cpp
index e9aeb00..d5752a9 100644
--- a/src/graph/detail/ExecutionHelpers.cpp
+++ b/src/graph/detail/ExecutionHelpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/frontend/Stream.cpp b/src/graph/frontend/Stream.cpp
index c04a426..44c8400 100644
--- a/src/graph/frontend/Stream.cpp
+++ b/src/graph/frontend/Stream.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/frontend/SubStream.cpp b/src/graph/frontend/SubStream.cpp
index e8bd23a..4b42207 100644
--- a/src/graph/frontend/SubStream.cpp
+++ b/src/graph/frontend/SubStream.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/mutators/DepthConcatSubTensorMutator.cpp b/src/graph/mutators/DepthConcatSubTensorMutator.cpp
index 30d6700..fa63f56 100644
--- a/src/graph/mutators/DepthConcatSubTensorMutator.cpp
+++ b/src/graph/mutators/DepthConcatSubTensorMutator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/mutators/GroupedConvolutionMutator.cpp b/src/graph/mutators/GroupedConvolutionMutator.cpp
index f8494a8..e3d3812 100644
--- a/src/graph/mutators/GroupedConvolutionMutator.cpp
+++ b/src/graph/mutators/GroupedConvolutionMutator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/mutators/InPlaceOperationMutator.cpp b/src/graph/mutators/InPlaceOperationMutator.cpp
index 3b06537..61639a8 100644
--- a/src/graph/mutators/InPlaceOperationMutator.cpp
+++ b/src/graph/mutators/InPlaceOperationMutator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,47 @@
 {
 namespace graph
 {
+namespace
+{
+// Check if the output edges of the parent node are separate tensors. If not,
+// it means the same output is connected to multiple nodes and computations on
+// these nodes cannot be done in-place.
+bool output_edges_are_separate_tensors(Graph &g, const Edge *input_edge)
+{
+    const auto parent_node   = input_edge->producer();
+    const auto input_tensor  = input_edge->tensor();
+    const auto input_edge_id = input_edge->id();
+
+    if(parent_node == nullptr)
+    {
+        return false;
+    }
+
+    const auto output_edges = parent_node->output_edges();
+
+    // If the output is connected to only one edge, then computations can
+    // be done in-place.
+    if(output_edges.size() == 1)
+    {
+        return true;
+    }
+
+    return std::all_of(output_edges.begin(),
+                       output_edges.end(),
+                       [&](const EdgeID & edge_id)
+    {
+        // Skip check on current input edge
+        if(edge_id == input_edge_id)
+        {
+            return true;
+        }
+
+        auto edge = g.edge(edge_id);
+        return edge->tensor() != input_tensor;
+    });
+}
+} // namespace
+
 const char *InPlaceOperationMutator::name()
 {
     return "InPlaceOperationMutator";
@@ -42,7 +83,14 @@
 
 void InPlaceOperationMutator::mutate(Graph &g)
 {
-    std::set<NodeType> in_place_nodes = { NodeType::BatchNormalizationLayer, NodeType::ActivationLayer, NodeType::PrintLayer };
+    std::set<NodeType> in_place_nodes =
+    {
+        NodeType::ActivationLayer,
+        NodeType::BatchNormalizationLayer,
+        NodeType::EltwiseLayer,
+        NodeType::UnaryEltwiseLayer,
+        NodeType::PrintLayer
+    };
 
     // Not interested in the order of nodes
     for(auto &node : g.nodes())
@@ -53,7 +101,7 @@
             Edge *input_edge = node->input_edge(0);
 
             // Check if parent has a single output if yes then force in place calculation else not
-            if((input_edge != nullptr) && (input_edge->producer() != nullptr) && (input_edge->producer()->output_edges().size() == 1))
+            if((input_edge != nullptr) && output_edges_are_separate_tensors(g, input_edge))
             {
                 // Get current and new output tensors
                 auto current_output_tensor = node->output(0);
diff --git a/src/graph/mutators/NodeExecutionMethodMutator.cpp b/src/graph/mutators/NodeExecutionMethodMutator.cpp
index 72e2645..48bb9f7 100644
--- a/src/graph/mutators/NodeExecutionMethodMutator.cpp
+++ b/src/graph/mutators/NodeExecutionMethodMutator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/mutators/NodeFusionMutator.cpp b/src/graph/mutators/NodeFusionMutator.cpp
index ae53b8f..afc4452 100644
--- a/src/graph/mutators/NodeFusionMutator.cpp
+++ b/src/graph/mutators/NodeFusionMutator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -226,6 +226,12 @@
         return;
     }
 
+    // EltwiseLayerNode can only be fused when dataype is float
+    if(n_node->type() == NodeType::EltwiseLayer && !is_data_type_float(n_node->output(0)->desc().data_type))
+    {
+        return;
+    }
+
     ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing node with ID : " << output_edge->producer_id()
                                   << " with Activation Layer node with ID : " << output_edge->consumer_id() << std::endl);
 
diff --git a/src/graph/mutators/SplitLayerSubTensorMutator.cpp b/src/graph/mutators/SplitLayerSubTensorMutator.cpp
index 3ba7307..359bba4 100644
--- a/src/graph/mutators/SplitLayerSubTensorMutator.cpp
+++ b/src/graph/mutators/SplitLayerSubTensorMutator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -82,7 +82,7 @@
 
                 auto *split_node = arm_compute::utils::cast::polymorphic_downcast<SplitLayerNode *>(node);
 
-                const unsigned int axis          = split_node->axis();
+                const int          axis          = split_node->axis();
                 const unsigned int num_splits    = split_node->num_splits();
                 const bool         extend_parent = (axis < 2);
 
@@ -92,7 +92,7 @@
                     Tensor           *output_tensor = node->output(i);
                     const TensorShape output_shape  = output_tensor->desc().shape;
                     Coordinates       coords;
-                    std::tie(std::ignore, coords) = SplitLayerNode::compute_output_descriptor(input_tensor->desc(), num_splits, axis, i);
+                    std::tie(std::ignore, coords) = split_node->compute_output_descriptor(input_tensor->desc(), num_splits, axis, i);
 
                     backends::IDeviceBackend      &backend = backends::BackendRegistry::get().get_backend(output_tensor->desc().target);
                     std::unique_ptr<ITensorHandle> handle  = backend.create_subtensor(input_tensor->handle(), output_shape, coords, extend_parent);
diff --git a/src/graph/mutators/SyntheticDataTypeMutator.cpp b/src/graph/mutators/SyntheticDataTypeMutator.cpp
index 0a9f505..dbbebdf 100644
--- a/src/graph/mutators/SyntheticDataTypeMutator.cpp
+++ b/src/graph/mutators/SyntheticDataTypeMutator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/ActivationLayerNode.cpp b/src/graph/nodes/ActivationLayerNode.cpp
index 6c0a7dd..cf65d83 100644
--- a/src/graph/nodes/ActivationLayerNode.cpp
+++ b/src/graph/nodes/ActivationLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/BatchNormalizationLayerNode.cpp b/src/graph/nodes/BatchNormalizationLayerNode.cpp
index 3d392bd..ceca0e2 100644
--- a/src/graph/nodes/BatchNormalizationLayerNode.cpp
+++ b/src/graph/nodes/BatchNormalizationLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/BoundingBoxTransformLayerNode.cpp b/src/graph/nodes/BoundingBoxTransformLayerNode.cpp
index ad261e3..f3f4f91 100644
--- a/src/graph/nodes/BoundingBoxTransformLayerNode.cpp
+++ b/src/graph/nodes/BoundingBoxTransformLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/ChannelShuffleLayerNode.cpp b/src/graph/nodes/ChannelShuffleLayerNode.cpp
index 08fcce1..5102e4b 100644
--- a/src/graph/nodes/ChannelShuffleLayerNode.cpp
+++ b/src/graph/nodes/ChannelShuffleLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/ConcatenateLayerNode.cpp b/src/graph/nodes/ConcatenateLayerNode.cpp
index 5f13b90..3f3c70f 100644
--- a/src/graph/nodes/ConcatenateLayerNode.cpp
+++ b/src/graph/nodes/ConcatenateLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/ConstNode.cpp b/src/graph/nodes/ConstNode.cpp
index 2f3cd14..eb96d63 100644
--- a/src/graph/nodes/ConstNode.cpp
+++ b/src/graph/nodes/ConstNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/ConvolutionLayerNode.cpp b/src/graph/nodes/ConvolutionLayerNode.cpp
index 2253934..a982570 100644
--- a/src/graph/nodes/ConvolutionLayerNode.cpp
+++ b/src/graph/nodes/ConvolutionLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/DeconvolutionLayerNode.cpp b/src/graph/nodes/DeconvolutionLayerNode.cpp
index 2daeaac..3542d5a 100644
--- a/src/graph/nodes/DeconvolutionLayerNode.cpp
+++ b/src/graph/nodes/DeconvolutionLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp b/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
index 3e99973..42fb0fd 100644
--- a/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
+++ b/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/DequantizationLayerNode.cpp b/src/graph/nodes/DequantizationLayerNode.cpp
index 27134b4..14c4752 100644
--- a/src/graph/nodes/DequantizationLayerNode.cpp
+++ b/src/graph/nodes/DequantizationLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/DetectionOutputLayerNode.cpp b/src/graph/nodes/DetectionOutputLayerNode.cpp
index c2d9f24..fc6f531 100644
--- a/src/graph/nodes/DetectionOutputLayerNode.cpp
+++ b/src/graph/nodes/DetectionOutputLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/DetectionPostProcessLayerNode.cpp b/src/graph/nodes/DetectionPostProcessLayerNode.cpp
index 4a5df1a..2c5005a 100644
--- a/src/graph/nodes/DetectionPostProcessLayerNode.cpp
+++ b/src/graph/nodes/DetectionPostProcessLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/DummyNode.cpp b/src/graph/nodes/DummyNode.cpp
index e641181..6fa9fba 100644
--- a/src/graph/nodes/DummyNode.cpp
+++ b/src/graph/nodes/DummyNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/EltwiseLayerNode.cpp b/src/graph/nodes/EltwiseLayerNode.cpp
index 92d183e..3149a9a 100644
--- a/src/graph/nodes/EltwiseLayerNode.cpp
+++ b/src/graph/nodes/EltwiseLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,6 +57,11 @@
     return descriptor.fused_activation;
 }
 
+QuantizationInfo EltwiseLayerNode::output_quant_info() const
+{
+    return descriptor.out_quant_info;
+}
+
 void EltwiseLayerNode::set_fused_activation(ActivationLayerInfo fused_activation)
 {
     descriptor.fused_activation = fused_activation;
@@ -100,5 +105,62 @@
 {
     v.visit(*this);
 }
+
+UnaryEltwiseLayerNode::UnaryEltwiseLayerNode(const descriptors::UnaryEltwiseLayerDescriptor &descriptor)
+    : descriptor(descriptor)
+{
+    _input_edges.resize(1, EmptyEdgeID);
+    _outputs.resize(1, NullTensorID);
+}
+
+descriptors::UnaryEltwiseLayerDescriptor UnaryEltwiseLayerNode::eltwise_descriptor() const
+{
+    return descriptor;
+}
+
+void UnaryEltwiseLayerNode::set_fused_activation(ActivationLayerInfo fused_activation)
+{
+    descriptor.fused_activation = fused_activation;
+}
+
+bool UnaryEltwiseLayerNode::forward_descriptors()
+{
+    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    {
+        Tensor *dst = output(0);
+        ARM_COMPUTE_ERROR_ON(dst == nullptr);
+        dst->desc() = configure_output(0);
+        return true;
+    }
+    return false;
+}
+
+TensorDescriptor UnaryEltwiseLayerNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx);
+
+    const Tensor *src = input(0);
+    ARM_COMPUTE_ERROR_ON(src == nullptr);
+
+    auto output_info = src->desc();
+
+    if(!descriptor.out_quant_info.empty())
+    {
+        output_info.set_quantization_info(descriptor.out_quant_info);
+    }
+
+    return output_info;
+}
+
+NodeType UnaryEltwiseLayerNode::type() const
+{
+    return NodeType::UnaryEltwiseLayer;
+}
+
+void UnaryEltwiseLayerNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+
 } // namespace graph
 } // namespace arm_compute
diff --git a/src/graph/nodes/FlattenLayerNode.cpp b/src/graph/nodes/FlattenLayerNode.cpp
index baae555..48519a1 100644
--- a/src/graph/nodes/FlattenLayerNode.cpp
+++ b/src/graph/nodes/FlattenLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/FullyConnectedLayer.cpp b/src/graph/nodes/FullyConnectedLayer.cpp
index 34c432a..442f636 100644
--- a/src/graph/nodes/FullyConnectedLayer.cpp
+++ b/src/graph/nodes/FullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/FusedConvolutionBatchNormalizationNode.cpp b/src/graph/nodes/FusedConvolutionBatchNormalizationNode.cpp
index 0a0c0c5..de995eb 100644
--- a/src/graph/nodes/FusedConvolutionBatchNormalizationNode.cpp
+++ b/src/graph/nodes/FusedConvolutionBatchNormalizationNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp b/src/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp
index a04d754..c022450 100644
--- a/src/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp
+++ b/src/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/GenerateProposalsLayerNode.cpp b/src/graph/nodes/GenerateProposalsLayerNode.cpp
index dabfc5a..9f36862 100644
--- a/src/graph/nodes/GenerateProposalsLayerNode.cpp
+++ b/src/graph/nodes/GenerateProposalsLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/InputNode.cpp b/src/graph/nodes/InputNode.cpp
index 709eaae..072281f 100644
--- a/src/graph/nodes/InputNode.cpp
+++ b/src/graph/nodes/InputNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/NormalizationLayerNode.cpp b/src/graph/nodes/NormalizationLayerNode.cpp
index a7b3738..eaa1bcf 100644
--- a/src/graph/nodes/NormalizationLayerNode.cpp
+++ b/src/graph/nodes/NormalizationLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/NormalizePlanarYUVLayerNode.cpp b/src/graph/nodes/NormalizePlanarYUVLayerNode.cpp
index 129b380..113d0a5 100644
--- a/src/graph/nodes/NormalizePlanarYUVLayerNode.cpp
+++ b/src/graph/nodes/NormalizePlanarYUVLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/OutputNode.cpp b/src/graph/nodes/OutputNode.cpp
index 8aa249b..dcdee88 100644
--- a/src/graph/nodes/OutputNode.cpp
+++ b/src/graph/nodes/OutputNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/PReluLayerNode.cpp b/src/graph/nodes/PReluLayerNode.cpp
index 9310aaf..378c18e 100644
--- a/src/graph/nodes/PReluLayerNode.cpp
+++ b/src/graph/nodes/PReluLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/PadLayerNode.cpp b/src/graph/nodes/PadLayerNode.cpp
index cbee134..6424370 100644
--- a/src/graph/nodes/PadLayerNode.cpp
+++ b/src/graph/nodes/PadLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/PermuteLayerNode.cpp b/src/graph/nodes/PermuteLayerNode.cpp
index 042ec09..b311ee1 100644
--- a/src/graph/nodes/PermuteLayerNode.cpp
+++ b/src/graph/nodes/PermuteLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/PoolingLayerNode.cpp b/src/graph/nodes/PoolingLayerNode.cpp
index b6f317a..4ecf924 100644
--- a/src/graph/nodes/PoolingLayerNode.cpp
+++ b/src/graph/nodes/PoolingLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/PrintLayerNode.cpp b/src/graph/nodes/PrintLayerNode.cpp
index 6a1a993..da408d8 100644
--- a/src/graph/nodes/PrintLayerNode.cpp
+++ b/src/graph/nodes/PrintLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/PriorBoxLayerNode.cpp b/src/graph/nodes/PriorBoxLayerNode.cpp
index edb1fba..f017ead 100644
--- a/src/graph/nodes/PriorBoxLayerNode.cpp
+++ b/src/graph/nodes/PriorBoxLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/QuantizationLayerNode.cpp b/src/graph/nodes/QuantizationLayerNode.cpp
index 009d701..db70c2c 100644
--- a/src/graph/nodes/QuantizationLayerNode.cpp
+++ b/src/graph/nodes/QuantizationLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/ROIAlignLayerNode.cpp b/src/graph/nodes/ROIAlignLayerNode.cpp
index 5e89ef2..6289181 100644
--- a/src/graph/nodes/ROIAlignLayerNode.cpp
+++ b/src/graph/nodes/ROIAlignLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/ReorgLayerNode.cpp b/src/graph/nodes/ReorgLayerNode.cpp
index 21ad451..e693e4b 100644
--- a/src/graph/nodes/ReorgLayerNode.cpp
+++ b/src/graph/nodes/ReorgLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/ReshapeLayer.cpp b/src/graph/nodes/ReshapeLayer.cpp
index 58610e9..a6354d0 100644
--- a/src/graph/nodes/ReshapeLayer.cpp
+++ b/src/graph/nodes/ReshapeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/ResizeLayerNode.cpp b/src/graph/nodes/ResizeLayerNode.cpp
index a399229..2a94bf6 100644
--- a/src/graph/nodes/ResizeLayerNode.cpp
+++ b/src/graph/nodes/ResizeLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/SliceLayerNode.cpp b/src/graph/nodes/SliceLayerNode.cpp
index bfc009d..3bd6752 100644
--- a/src/graph/nodes/SliceLayerNode.cpp
+++ b/src/graph/nodes/SliceLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/SoftmaxLayerNode.cpp b/src/graph/nodes/SoftmaxLayerNode.cpp
index 57e5561..fb907f4 100644
--- a/src/graph/nodes/SoftmaxLayerNode.cpp
+++ b/src/graph/nodes/SoftmaxLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/SplitLayerNode.cpp b/src/graph/nodes/SplitLayerNode.cpp
index 5d46c9d..31931c3 100644
--- a/src/graph/nodes/SplitLayerNode.cpp
+++ b/src/graph/nodes/SplitLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/graph/nodes/SplitLayerNode.h"
 
+#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/graph/Graph.h"
 #include "arm_compute/graph/INodeVisitor.h"
@@ -31,8 +32,8 @@
 {
 namespace graph
 {
-SplitLayerNode::SplitLayerNode(unsigned int num_splits, unsigned int axis)
-    : _num_splits(num_splits), _axis(axis)
+SplitLayerNode::SplitLayerNode(unsigned int num_splits, int axis, std::vector<int> size_splits)
+    : _num_splits(num_splits), _axis(axis), _size_splits(size_splits)
 {
     _input_edges.resize(1, EmptyEdgeID);
     _outputs.resize(num_splits, NullTensorID);
@@ -49,15 +50,34 @@
 }
 
 std::pair<TensorDescriptor, Coordinates> SplitLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
-                                                                                   unsigned int num_splits, unsigned int axis, unsigned int idx)
+                                                                                   unsigned int num_splits, int axis, unsigned int idx)
 {
-    const unsigned int split_size = input_descriptor.shape[axis] / num_splits;
-
+    // Handle negative axis, negative index is used to specify axis from the end (e.g. -1 for the last axis).
+    int              num_dimension = static_cast<int32_t>(input_descriptor.shape.num_dimensions());
+    int              tmp_axis      = wrap_around(axis, num_dimension);
+    Coordinates      coords;
     TensorDescriptor output_descriptor = input_descriptor;
-    output_descriptor.shape.set(axis, split_size);
-
-    Coordinates coords;
-    coords.set(axis, idx * split_size);
+    int              split_size        = input_descriptor.shape[tmp_axis] / num_splits;
+    if(_size_splits.empty())
+    {
+        output_descriptor.shape.set(tmp_axis, split_size);
+        coords.set(tmp_axis, idx * split_size);
+    }
+    else
+    {
+        int split_size = _size_splits[idx];
+        if(split_size == -1)
+        {
+            split_size = input_descriptor.shape[tmp_axis];
+            for(unsigned int i = 0; i < _size_splits.size() - 1; ++i)
+                split_size -= _size_splits[i];
+        }
+        output_descriptor.shape.set(tmp_axis, split_size);
+        int coord_value = 0;
+        for(unsigned int i = 0; i < idx; ++i)
+            coord_value += _size_splits[i];
+        coords.set(tmp_axis, coord_value);
+    }
 
     return std::make_pair(output_descriptor, coords);
 }
@@ -89,18 +109,39 @@
     const Tensor *src = input(0);
     ARM_COMPUTE_ERROR_ON(src == nullptr);
 
-    TensorDescriptor output_info;
-    std::tie(output_info, std::ignore) = compute_output_descriptor(src->desc(), _num_splits, _axis, idx);
+    TensorDescriptor input_descriptor  = src->desc();
+    TensorDescriptor output_descriptor = input_descriptor;
 
-    return output_info;
+    // Handle negative axis, negative index is used to specify axis from the end (e.g. -1 for the last axis).
+    int num_dimension = static_cast<int32_t>(src->desc().shape.num_dimensions());
+    int tmp_axis      = wrap_around(_axis, num_dimension);
+
+    int split_size = (_size_splits.empty()) ? (input_descriptor.shape[tmp_axis] / _num_splits) : _size_splits[idx];
+    if(split_size == -1)
+    {
+        split_size = input_descriptor.shape[tmp_axis];
+        for(unsigned int i = 0; i < _size_splits.size() - 1; ++i)
+            split_size -= _size_splits[i];
+    }
+    output_descriptor.shape.set(tmp_axis, split_size);
+
+    return output_descriptor;
 }
 
 Status SplitLayerNode::validate() const
 {
     const Tensor *src = input(0);
     ARM_COMPUTE_RETURN_ERROR_ON(src == nullptr);
-    ARM_COMPUTE_RETURN_ERROR_ON(_axis >= src->desc().shape.num_dimensions());
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->desc().shape[_axis] % _num_splits, "Split should be exact");
+    int num_dimension = static_cast<int32_t>(src->desc().shape.num_dimensions());
+    ARM_COMPUTE_RETURN_ERROR_ON(_axis < (-num_dimension) || _axis >= num_dimension);
+
+    // Handle negative axis, negative index is used to specify axis from the end (e.g. -1 for the last axis).
+    int tmp_axis = wrap_around(_axis, num_dimension);
+
+    if(_size_splits.empty())
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->desc().shape[tmp_axis] % _num_splits, "Split should be exact");
+    }
 
     return Status{};
 }
diff --git a/src/graph/nodes/StackLayerNode.cpp b/src/graph/nodes/StackLayerNode.cpp
index d26498a..f292b33 100644
--- a/src/graph/nodes/StackLayerNode.cpp
+++ b/src/graph/nodes/StackLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/UpsampleLayerNode.cpp b/src/graph/nodes/UpsampleLayerNode.cpp
index 88af122..3f84217 100644
--- a/src/graph/nodes/UpsampleLayerNode.cpp
+++ b/src/graph/nodes/UpsampleLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/nodes/YOLOLayerNode.cpp b/src/graph/nodes/YOLOLayerNode.cpp
index cf1e576..b29dd03 100644
--- a/src/graph/nodes/YOLOLayerNode.cpp
+++ b/src/graph/nodes/YOLOLayerNode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/graph/printers/DotGraphPrinter.cpp b/src/graph/printers/DotGraphPrinter.cpp
index 4089088..2e1e9d0 100644
--- a/src/graph/printers/DotGraphPrinter.cpp
+++ b/src/graph/printers/DotGraphPrinter.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/Allocator.cpp b/src/runtime/Allocator.cpp
index bf21951..12478be 100644
--- a/src/runtime/Allocator.cpp
+++ b/src/runtime/Allocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/BlobLifetimeManager.cpp b/src/runtime/BlobLifetimeManager.cpp
index 6bbb731..08f46e5 100644
--- a/src/runtime/BlobLifetimeManager.cpp
+++ b/src/runtime/BlobLifetimeManager.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/BlobMemoryPool.cpp b/src/runtime/BlobMemoryPool.cpp
index 907b39f..88bb421 100644
--- a/src/runtime/BlobMemoryPool.cpp
+++ b/src/runtime/BlobMemoryPool.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/CLBufferAllocator.cpp b/src/runtime/CL/CLBufferAllocator.cpp
index f50d100..3d38019 100644
--- a/src/runtime/CL/CLBufferAllocator.cpp
+++ b/src/runtime/CL/CLBufferAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/CLDistribution1D.cpp b/src/runtime/CL/CLDistribution1D.cpp
index f1dd95e..91d67cb 100644
--- a/src/runtime/CL/CLDistribution1D.cpp
+++ b/src/runtime/CL/CLDistribution1D.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/CLHOG.cpp b/src/runtime/CL/CLHOG.cpp
index c4ea639..6a02d8f 100644
--- a/src/runtime/CL/CLHOG.cpp
+++ b/src/runtime/CL/CLHOG.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/CLHelpers.cpp b/src/runtime/CL/CLHelpers.cpp
index c4c7ee2..adfdc3c 100644
--- a/src/runtime/CL/CLHelpers.cpp
+++ b/src/runtime/CL/CLHelpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/CLLut.cpp b/src/runtime/CL/CLLut.cpp
index a8cbf21..eb9422c 100644
--- a/src/runtime/CL/CLLut.cpp
+++ b/src/runtime/CL/CLLut.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/CLLutAllocator.cpp b/src/runtime/CL/CLLutAllocator.cpp
index 311de4b..d690cf2 100644
--- a/src/runtime/CL/CLLutAllocator.cpp
+++ b/src/runtime/CL/CLLutAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/CLMemory.cpp b/src/runtime/CL/CLMemory.cpp
index 557378b..efbc68f 100644
--- a/src/runtime/CL/CLMemory.cpp
+++ b/src/runtime/CL/CLMemory.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/CLMemoryRegion.cpp b/src/runtime/CL/CLMemoryRegion.cpp
index 7ae16ec..0952139 100644
--- a/src/runtime/CL/CLMemoryRegion.cpp
+++ b/src/runtime/CL/CLMemoryRegion.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/CLMultiHOG.cpp b/src/runtime/CL/CLMultiHOG.cpp
index 14cd68a..28a802c 100644
--- a/src/runtime/CL/CLMultiHOG.cpp
+++ b/src/runtime/CL/CLMultiHOG.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/CLMultiImage.cpp b/src/runtime/CL/CLMultiImage.cpp
index 92254f3..28b3f85 100644
--- a/src/runtime/CL/CLMultiImage.cpp
+++ b/src/runtime/CL/CLMultiImage.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp b/src/runtime/CL/CLOperator.cpp
similarity index 60%
copy from src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
copy to src/runtime/CL/CLOperator.cpp
index 36f84d8..57a4d0e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
+++ b/src/runtime/CL/CLOperator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,37 +21,36 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#pragma once
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/ICLOperator.h"
 
-#ifdef __aarch64__
+namespace arm_compute
+{
+namespace experimental
+{
+ICLOperator::ICLOperator(IRuntimeContext *ctx)
+    : _kernel(), _ctx(ctx), _workspace()
+{
+}
 
-namespace arm_gemm {
-
-// Actual kernel implementations
-void a64_sgemv_trans(const float *, const float *, float *, float, int, int, int);
-
-// Transposed SGEMV strategy class.
-class sgemv_trans {
-public:
-    typedef float operand_type;
-    typedef float result_type;
-
-    typedef void (*kern_type)(const float *, const float *, float *, float, int, int, int);
-
-    /* Kernel blocking parameters */
-    static unsigned int out_width() {
-        return 96;
+void ICLOperator::run(ITensorPack &tensors)
+{
+    if(tensors.empty())
+    {
+        ARM_COMPUTE_ERROR("No inputs provided");
     }
 
-    static unsigned int k_unroll() {
-        return 1;
-    }
+    CLScheduler::get().enqueue_op(*_kernel.get(), tensors, false);
+}
 
-    kern_type kernel=a64_sgemv_trans;
+void ICLOperator::prepare(ITensorPack &constants)
+{
+    ARM_COMPUTE_UNUSED(constants);
+}
 
-    sgemv_trans(const CPUInfo *ci) { UNUSED(ci); }
-};
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
+MemoryRequirements ICLOperator::workspace() const
+{
+    return {};
+}
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/runtime/CL/CLPyramid.cpp b/src/runtime/CL/CLPyramid.cpp
index ef8a1e5..dfa542e 100644
--- a/src/runtime/CL/CLPyramid.cpp
+++ b/src/runtime/CL/CLPyramid.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/CLRuntimeContext.cpp b/src/runtime/CL/CLRuntimeContext.cpp
index 4d70eda..2fc7f93 100644
--- a/src/runtime/CL/CLRuntimeContext.cpp
+++ b/src/runtime/CL/CLRuntimeContext.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index e78eaa4..ccef5cb 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -151,25 +151,37 @@
     _cl_tuner       = cl_tuner;
 }
 
-void CLScheduler::enqueue(ICLKernel &kernel, bool flush)
+void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool flush)
 {
     ARM_COMPUTE_ERROR_ON_MSG(!_is_initialised,
                              "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \
                              or CLScheduler::get()::init() and CLKernelLibrary::get()::init() function before running functions!");
 
+    const bool inject_memory = !tensors.empty();
+
     // Tune the kernel if the CLTuner has been provided
     if(_cl_tuner != nullptr)
     {
-        // Tune the OpenCL kernel
-        _cl_tuner->tune_kernel_dynamic(kernel);
+        inject_memory ? _cl_tuner->tune_kernel_dynamic(kernel, tensors) : _cl_tuner->tune_kernel_dynamic(kernel);
     }
 
     // Run kernel
-    kernel.run(kernel.window(), _queue);
+    inject_memory ? kernel.run_op(tensors, kernel.window(), _queue) : kernel.run(kernel.window(), _queue);
 
     if(flush)
     {
         _queue.flush();
     }
 }
+
+void CLScheduler::enqueue(ICLKernel &kernel, bool flush)
+{
+    ITensorPack pack;
+    enqueue_common(kernel, pack, flush);
+}
+
+void CLScheduler::enqueue_op(ICLKernel &kernel, ITensorPack &tensors, bool flush)
+{
+    enqueue_common(kernel, tensors, flush);
+}
 } // namespace arm_compute
diff --git a/src/runtime/CL/CLSubTensor.cpp b/src/runtime/CL/CLSubTensor.cpp
index 0f36250..d082241 100644
--- a/src/runtime/CL/CLSubTensor.cpp
+++ b/src/runtime/CL/CLSubTensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/CLTensor.cpp b/src/runtime/CL/CLTensor.cpp
index a6d0cf7..db94639 100644
--- a/src/runtime/CL/CLTensor.cpp
+++ b/src/runtime/CL/CLTensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp
index eaf46d4..90d7788 100644
--- a/src/runtime/CL/CLTensorAllocator.cpp
+++ b/src/runtime/CL/CLTensorAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index 5f2fa7d..adfe67f 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -77,6 +77,12 @@
 
 void CLTuner::tune_kernel_dynamic(ICLKernel &kernel)
 {
+    ITensorPack pack;
+    tune_kernel_dynamic(kernel, pack);
+}
+
+void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors)
+{
     // Get the configuration ID from the kernel and append GPU target name and number of available compute units
     const std::string config_id = kernel.config_id() + "_" + string_from_target(kernel.get_target()) + "_MP" + support::cpp11::to_string(CLKernelLibrary::get().get_num_compute_units());
 
@@ -90,7 +96,7 @@
             if(_tune_new_kernels)
             {
                 // Find the optimal LWS for the kernel
-                cl::NDRange opt_lws = find_optimal_lws(kernel);
+                cl::NDRange opt_lws = find_optimal_lws(kernel, tensors);
 
                 // Insert the optimal LWS in the table
                 add_lws_to_table(config_id, opt_lws);
@@ -112,7 +118,7 @@
     _lws_table.emplace(kernel_id, optimal_lws);
 }
 
-cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel)
+cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel, ITensorPack &tensors)
 {
     // Profiling queue
     cl::CommandQueue queue_profiler;
@@ -167,7 +173,8 @@
     cl::NDRange gws = ICLKernel::gws_from_window(kernel.window());
 
     // Run the kernel with default lws to be used as baseline
-    kernel.run(kernel.window(), queue_profiler);
+    const bool inject_memory = !tensors.empty();
+    inject_memory ? kernel.run_op(tensors, kernel.window(), queue_profiler) : kernel.run(kernel.window(), queue_profiler);
 
     queue_profiler.finish();
 
@@ -178,7 +185,7 @@
 
     cl::NDRange opt_lws = cl::NullRange;
 
-    //Construct the list of LWS values to be tested based on the tuner mode.
+    // Construct the list of LWS values to be tested based on the tuner mode.
     auto lws_list = cl_tuner::CLLWSListFactory::get_lws_list(_tuner_mode, gws);
     for(size_t i = 0; i < lws_list->size(); ++i)
     {
@@ -197,7 +204,7 @@
         kernel.set_lws_hint(lws_test);
 
         // Run the kernel
-        kernel.run(kernel.window(), queue_profiler);
+        inject_memory ? kernel.run_op(tensors, kernel.window(), queue_profiler) : kernel.run(kernel.window(), queue_profiler);
 
         queue_profiler.finish();
 
diff --git a/src/runtime/CL/ICLSimpleFunction.cpp b/src/runtime/CL/ICLSimpleFunction.cpp
index fb8eba8..b00ad5e 100644
--- a/src/runtime/CL/ICLSimpleFunction.cpp
+++ b/src/runtime/CL/ICLSimpleFunction.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/TracePoint.cpp b/src/runtime/CL/TracePoint.cpp
index 97029f5..9991424 100644
--- a/src/runtime/CL/TracePoint.cpp
+++ b/src/runtime/CL/TracePoint.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/Utils.cpp b/src/runtime/CL/Utils.cpp
index 5e22dfd..e04ce84 100644
--- a/src/runtime/CL/Utils.cpp
+++ b/src/runtime/CL/Utils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLAbsoluteDifference.cpp b/src/runtime/CL/functions/CLAbsoluteDifference.cpp
index 492c54e..d5d1bbd 100644
--- a/src/runtime/CL/functions/CLAbsoluteDifference.cpp
+++ b/src/runtime/CL/functions/CLAbsoluteDifference.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLAccumulate.cpp b/src/runtime/CL/functions/CLAccumulate.cpp
index a81d1d0..2f06252 100644
--- a/src/runtime/CL/functions/CLAccumulate.cpp
+++ b/src/runtime/CL/functions/CLAccumulate.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp
index 989603a..5ddf227 100644
--- a/src/runtime/CL/functions/CLActivationLayer.cpp
+++ b/src/runtime/CL/functions/CLActivationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
 
+#include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLRuntimeContext.h"
@@ -30,11 +31,41 @@
 
 namespace arm_compute
 {
-CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx)
-    : ICLSimpleFunction(ctx)
+namespace experimental
 {
+void CLActivation::configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *output, ActivationLayerInfo act_info)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLActivationLayerKernel>();
+    k->configure(compile_context, input, output, act_info);
+    _kernel = std::move(k);
 }
 
+Status CLActivation::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+    return CLActivationLayerKernel::validate(input, output, act_info);
+}
+} // namespace experimental
+
+struct CLActivationLayer::Impl
+{
+    const ICLTensor                            *src{ nullptr };
+    ICLTensor                                  *dst{ nullptr };
+    CLRuntimeContext                           *ctx{ nullptr };
+    std::unique_ptr<experimental::CLActivation> op{ nullptr };
+};
+
+CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx)
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+    _impl->ctx = ctx;
+}
+
+CLActivationLayer::CLActivationLayer(CLActivationLayer &&) = default;
+
+CLActivationLayer &CLActivationLayer::operator=(CLActivationLayer &&) = default;
+
+CLActivationLayer::~CLActivationLayer() = default;
+
 void CLActivationLayer::configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, act_info);
@@ -42,13 +73,25 @@
 
 void CLActivationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLActivationLayerKernel>();
-    k->configure(compile_context, input, output, act_info);
-    _kernel = std::move(k);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+    _impl->src = input;
+    _impl->dst = output == nullptr ? input : output;
+
+    _impl->op = arm_compute::support::cpp14::make_unique<experimental::CLActivation>();
+    _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), act_info);
 }
 
 Status CLActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
-    return CLActivationLayerKernel::validate(input, output, act_info);
+    return experimental::CLActivation::validate(input, output, act_info);
+}
+
+void CLActivationLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
 }
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
index 5b4c694..ad6e7ba 100644
--- a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
+++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 
 #include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h"
 
+#include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
@@ -34,13 +35,15 @@
 namespace arm_compute
 {
 CLArgMinMaxLayer::CLArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _results_vector(), _not_reshaped_output(), _reduction_kernels_vector(), _reshape_kernel(), _num_of_stages(), _reduction_axis()
+    : _memory_group(std::move(memory_manager)), _results_vector(), _not_reshaped_output(), _reduction_kernels_vector(), _reshape(), _num_of_stages(), _reduction_axis()
 {
 }
 
 Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid reduction operation");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions), "Reduction axis greater than max number of dimensions");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
@@ -100,7 +103,7 @@
         const unsigned int last_stage = num_of_stages - 1;
         ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernel::validate(input, &sums_vector[last_stage - 1], &not_reshaped_output, axis, op));
     }
-    ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&not_reshaped_output, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&not_reshaped_output, output));
     return Status{};
 }
 
@@ -155,7 +158,7 @@
         _reduction_kernels_vector[last_stage].configure(compile_context, input, &_results_vector[last_stage - 1], &_not_reshaped_output, axis, op);
         _results_vector[last_stage - 1].allocator()->allocate();
     }
-    _reshape_kernel.configure(compile_context, &_not_reshaped_output, output);
+    _reshape.configure(compile_context, &_not_reshaped_output, output);
     _not_reshaped_output.allocator()->allocate();
 }
 
@@ -167,6 +170,6 @@
     {
         CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
     }
-    CLScheduler::get().enqueue(_reshape_kernel, false);
+    _reshape.run();
 }
 } // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
index 9fc5113..701add0 100644
--- a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
index 0a2ae2a..5ba3b5b 100644
--- a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
+++ b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLBitwiseAnd.cpp b/src/runtime/CL/functions/CLBitwiseAnd.cpp
index 1fa80f0..cb49e61 100644
--- a/src/runtime/CL/functions/CLBitwiseAnd.cpp
+++ b/src/runtime/CL/functions/CLBitwiseAnd.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLBitwiseNot.cpp b/src/runtime/CL/functions/CLBitwiseNot.cpp
index 4659519..22c575c 100644
--- a/src/runtime/CL/functions/CLBitwiseNot.cpp
+++ b/src/runtime/CL/functions/CLBitwiseNot.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLBitwiseOr.cpp b/src/runtime/CL/functions/CLBitwiseOr.cpp
index 8431140..4bbb890 100644
--- a/src/runtime/CL/functions/CLBitwiseOr.cpp
+++ b/src/runtime/CL/functions/CLBitwiseOr.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLBitwiseXor.cpp b/src/runtime/CL/functions/CLBitwiseXor.cpp
index 0e0e7f2..bc37f6e 100644
--- a/src/runtime/CL/functions/CLBitwiseXor.cpp
+++ b/src/runtime/CL/functions/CLBitwiseXor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp
index 55bcde7..2384fc4 100644
--- a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp
+++ b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLBox3x3.cpp b/src/runtime/CL/functions/CLBox3x3.cpp
index 72c8221..0300899 100644
--- a/src/runtime/CL/functions/CLBox3x3.cpp
+++ b/src/runtime/CL/functions/CLBox3x3.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLCannyEdge.cpp b/src/runtime/CL/functions/CLCannyEdge.cpp
index 0c8d353..cd2d6b4 100644
--- a/src/runtime/CL/functions/CLCannyEdge.cpp
+++ b/src/runtime/CL/functions/CLCannyEdge.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLCast.cpp b/src/runtime/CL/functions/CLCast.cpp
index 7048a79..95cc0e9 100644
--- a/src/runtime/CL/functions/CLCast.cpp
+++ b/src/runtime/CL/functions/CLCast.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLChannelCombine.cpp b/src/runtime/CL/functions/CLChannelCombine.cpp
index 249212e..326caa8 100644
--- a/src/runtime/CL/functions/CLChannelCombine.cpp
+++ b/src/runtime/CL/functions/CLChannelCombine.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLChannelExtract.cpp b/src/runtime/CL/functions/CLChannelExtract.cpp
index 019e0a7..aa37af9 100644
--- a/src/runtime/CL/functions/CLChannelExtract.cpp
+++ b/src/runtime/CL/functions/CLChannelExtract.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
index 93ab7c7..b79afdb 100644
--- a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
+++ b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLColorConvert.cpp b/src/runtime/CL/functions/CLColorConvert.cpp
index b8e5977..2bbb30e 100644
--- a/src/runtime/CL/functions/CLColorConvert.cpp
+++ b/src/runtime/CL/functions/CLColorConvert.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLComparison.cpp b/src/runtime/CL/functions/CLComparison.cpp
index 8d5ec35..8c18b35 100644
--- a/src/runtime/CL/functions/CLComparison.cpp
+++ b/src/runtime/CL/functions/CLComparison.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLComputeAllAnchors.cpp b/src/runtime/CL/functions/CLComputeAllAnchors.cpp
index 62714fe..be86fc4 100644
--- a/src/runtime/CL/functions/CLComputeAllAnchors.cpp
+++ b/src/runtime/CL/functions/CLComputeAllAnchors.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp
index e972567..4214813 100644
--- a/src/runtime/CL/functions/CLConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -40,61 +40,32 @@
 
 namespace arm_compute
 {
-CLConcatenateLayer::CLConcatenateLayer()
+namespace experimental
+{
+CLConcatenation::CLConcatenation()
     : _concat_kernels(),
       _num_inputs(0),
       _axis(Window::DimX)
 {
 }
 
-void CLConcatenateLayer::configure(std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), inputs_vector, output, axis);
-}
-
-void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
-{
-    configure_internal(compile_context, std::move(inputs_vector), output, axis);
-}
-
-void CLConcatenateLayer::configure(std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
-{
-    configure(CLKernelLibrary::get().get_compile_context(), inputs_vector, output, axis);
-}
-
-void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
-{
-    configure_internal(compile_context, std::move(inputs_vector), output, axis);
-}
-
-Status CLConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
-{
-    return validate_internal(inputs_vector, output, axis);
-}
-
-Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
-{
-    return validate_internal(inputs_vector, output, axis);
-}
-
-template <typename TensorType>
-void CLConcatenateLayer::configure_internal(const CLCompileContext &compile_context, std::vector<TensorType *> &&inputs_vector, ICLTensor *output, size_t axis)
+void CLConcatenation::configure(const CLCompileContext &compile_context, const std::vector<ITensorInfo *> &inputs_vector, ITensorInfo *output, size_t axis)
 {
     ARM_COMPUTE_ERROR_ON(output == nullptr);
     _axis       = axis;
     _num_inputs = inputs_vector.size();
 
-    std::vector<ITensorInfo *> inputs_vector_info(inputs_vector.size());
-    std::transform(inputs_vector.begin(), inputs_vector.end(), inputs_vector_info.begin(), [](TensorType * t)
+    TensorShape                      output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, _axis);
+    std::vector<const ITensorInfo *> const_inputs_vector(inputs_vector.size());
+    std::transform(inputs_vector.begin(), inputs_vector.end(), const_inputs_vector.begin(), [](ITensorInfo * t)
     {
         ARM_COMPUTE_ERROR_ON_NULLPTR(t);
-        return t->info();
+        return t;
     });
-    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, _axis);
 
     // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
-    ARM_COMPUTE_ERROR_THROW_ON(CLConcatenateLayer::validate(inputs_vector_info, output->info(), axis));
+    auto_init_if_empty(*output, output_shape, 1, inputs_vector[0]->data_type());
+    ARM_COMPUTE_ERROR_THROW_ON(CLConcatenateLayer::validate(const_inputs_vector, output, axis));
 
     unsigned int offset = 0;
     switch(_axis)
@@ -126,7 +97,7 @@
                     {
                         auto kernel = support::cpp14::make_unique<CLWidthConcatenateLayerKernel>();
                         kernel->configure(compile_context, inputs_vector.at(i), offset, output);
-                        offset += inputs_vector.at(i)->info()->dimension(_axis);
+                        offset += inputs_vector.at(i)->dimension(_axis);
                         _concat_kernels.emplace_back(std::move(kernel));
                     }
                     break;
@@ -140,7 +111,7 @@
             {
                 auto kernel = support::cpp14::make_unique<CLHeightConcatenateLayerKernel>();
                 kernel->configure(compile_context, inputs_vector.at(i), offset, output);
-                offset += inputs_vector.at(i)->info()->dimension(_axis);
+                offset += inputs_vector.at(i)->dimension(_axis);
                 _concat_kernels.emplace_back(std::move(kernel));
             }
             break;
@@ -151,7 +122,7 @@
             {
                 auto kernel = support::cpp14::make_unique<CLDepthConcatenateLayerKernel>();
                 kernel->configure(compile_context, inputs_vector.at(i), offset, output);
-                offset += inputs_vector.at(i)->info()->dimension(_axis);
+                offset += inputs_vector.at(i)->dimension(_axis);
                 _concat_kernels.emplace_back(std::move(kernel));
             }
             break;
@@ -162,7 +133,7 @@
             {
                 auto kernel = support::cpp14::make_unique<CLBatchConcatenateLayerKernel>();
                 kernel->configure(compile_context, inputs_vector.at(i), offset, output);
-                offset += inputs_vector.at(i)->info()->dimension(_axis);
+                offset += inputs_vector.at(i)->dimension(_axis);
                 _concat_kernels.emplace_back(std::move(kernel));
             }
             break;
@@ -172,8 +143,7 @@
     }
 }
 
-template <typename TensorInfoType>
-Status CLConcatenateLayer::validate_internal(const std::vector<TensorInfoType *> &inputs_vector, const ITensorInfo *output, size_t axis)
+Status CLConcatenation::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(output == nullptr);
     const unsigned int num_inputs = inputs_vector.size();
@@ -250,11 +220,96 @@
     return Status{};
 }
 
+void CLConcatenation::run(ITensorPack &tensors)
+{
+    if(tensors.empty())
+    {
+        ARM_COMPUTE_ERROR("No inputs provided");
+    }
+
+    if(static_cast<int>(tensors.size()) - 1 != static_cast<int>(_num_inputs))
+    {
+        ARM_COMPUTE_ERROR("Configured with different number of inputs");
+    }
+
+    if(_axis == Window::DimX && (_num_inputs == 2 || _num_inputs == 4))
+    {
+        ARM_COMPUTE_ERROR_ON(_concat_kernels.empty());
+        CLScheduler::get().enqueue_op(*_concat_kernels.at(0), tensors, true);
+    }
+    else
+    {
+        int i = 0;
+        for(auto &k : _concat_kernels)
+        {
+            ITensorPack pack;
+            pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i));
+            pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(ACL_DST));
+            CLScheduler::get().enqueue_op(*k, pack, true);
+            ++i;
+        }
+    }
+}
+} // namespace experimental
+
+struct CLConcatenateLayer::Impl
+{
+    std::vector<const ICLTensor *>                 srcs{};
+    ICLTensor                                     *dst{ nullptr };
+    unsigned int                                   num_inputs{ 0 };
+    unsigned int                                   axis{ 0 };
+    std::unique_ptr<experimental::CLConcatenation> op{ nullptr };
+};
+
+CLConcatenateLayer::CLConcatenateLayer()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+
+CLConcatenateLayer::CLConcatenateLayer(CLConcatenateLayer &&) = default;
+
+CLConcatenateLayer &CLConcatenateLayer::operator=(CLConcatenateLayer &&) = default;
+
+CLConcatenateLayer::~CLConcatenateLayer() = default;
+
+void CLConcatenateLayer::configure(std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), inputs_vector, output, axis);
+}
+
+void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
+{
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    _impl->srcs       = inputs_vector;
+    _impl->dst        = output;
+    _impl->axis       = axis;
+    _impl->num_inputs = inputs_vector.size();
+    _impl->op         = arm_compute::support::cpp14::make_unique<experimental::CLConcatenation>();
+
+    std::vector<ITensorInfo *> inputs_vector_info;
+    for(unsigned int i = 0; i < inputs_vector.size(); ++i)
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i));
+        inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
+    }
+    _impl->op->configure(compile_context, inputs_vector_info, _impl->dst->info(), axis);
+}
+
+Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
+{
+    return experimental::CLConcatenation::validate(inputs_vector, output, axis);
+}
+
 void CLConcatenateLayer::run()
 {
-    for(auto &kernel : _concat_kernels)
+    ITensorPack pack;
+    for(unsigned i = 0; i < _impl->num_inputs; ++i)
     {
-        CLScheduler::get().enqueue(*kernel, true);
+        pack.add_tensor(TensorType::ACL_SRC_VEC + i, _impl->srcs.at(i));
     }
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+
+    _impl->op->run(pack);
 }
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
index 68c0fb6..4c78767 100644
--- a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
+++ b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLConvolution.cpp b/src/runtime/CL/functions/CLConvolution.cpp
index 2b0d7d5..bc962d0 100644
--- a/src/runtime/CL/functions/CLConvolution.cpp
+++ b/src/runtime/CL/functions/CLConvolution.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index b6e1413..630352e 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLCopy.cpp b/src/runtime/CL/functions/CLCopy.cpp
index 4c5d62a..acdc52d 100644
--- a/src/runtime/CL/functions/CLCopy.cpp
+++ b/src/runtime/CL/functions/CLCopy.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLCropResize.cpp b/src/runtime/CL/functions/CLCropResize.cpp
index 17fc80e..529f7bf 100644
--- a/src/runtime/CL/functions/CLCropResize.cpp
+++ b/src/runtime/CL/functions/CLCropResize.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -135,7 +135,7 @@
         configure_crop(_input, _boxes, _box_ind, _crop_results[num_box].get(), num_box, start, end, batch_index);
 
         auto scale_kernel = support::cpp14::make_unique<CLScale>();
-        scale_kernel->configure(compile_context, _crop_results[num_box].get(), _scaled_results[num_box].get(), _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT);
+        scale_kernel->configure(compile_context, _crop_results[num_box].get(), _scaled_results[num_box].get(), ScaleKernelInfo{ _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT });
         _scale.emplace_back(std::move(scale_kernel));
 
         Window win = calculate_max_window(*_output->info());
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index 62e7d9a..cd55336 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
index be2d120..eb1fb7f 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
index b848f98..141eb3f 100644
--- a/src/runtime/CL/functions/CLDepthConvertLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp
index 89e5faa..8571056 100644
--- a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index b1e9fe7..bb0db2e 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp
index 362b36c..66ac58e 100644
--- a/src/runtime/CL/functions/CLDequantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLDerivative.cpp b/src/runtime/CL/functions/CLDerivative.cpp
index 68d3752..7138281 100644
--- a/src/runtime/CL/functions/CLDerivative.cpp
+++ b/src/runtime/CL/functions/CLDerivative.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLDilate.cpp b/src/runtime/CL/functions/CLDilate.cpp
index 05351a9..27acf9f 100644
--- a/src/runtime/CL/functions/CLDilate.cpp
+++ b/src/runtime/CL/functions/CLDilate.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
index 6e9782f..c1055dd 100644
--- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
index da16bed..3515c25 100644
--- a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
index ce61532..de94255 100644
--- a/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
+++ b/src/runtime/CL/functions/CLElementWiseUnaryLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,109 @@
 
 namespace arm_compute
 {
+namespace experimental
+{
+void CLRsqrt::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+    k->configure(compile_context, input, output, ElementWiseUnary::RSQRT);
+    _kernel = std::move(k);
+}
+
+Status CLRsqrt::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return arm_compute::CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::RSQRT);
+}
+
+void CLExp::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+    k->configure(compile_context, input, output, ElementWiseUnary::EXP);
+    _kernel = std::move(k);
+}
+
+Status CLExp::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return arm_compute::CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::EXP);
+}
+
+void CLNeg::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+    k->configure(compile_context, input, output, ElementWiseUnary::NEG);
+    _kernel = std::move(k);
+}
+
+Status CLNeg::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return arm_compute::CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::NEG);
+}
+
+void CLSin::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+    k->configure(compile_context, input, output, ElementWiseUnary::SIN);
+    _kernel = std::move(k);
+}
+
+Status CLSin::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return arm_compute::CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::SIN);
+}
+
+void CLAbs::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+    k->configure(compile_context, input, output, ElementWiseUnary::ABS);
+    _kernel = std::move(k);
+}
+
+Status CLAbs::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return arm_compute::CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::ABS);
+}
+
+void CLLog::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+    k->configure(compile_context, input, output, ElementWiseUnary::LOG);
+    _kernel = std::move(k);
+}
+
+Status CLLog::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return arm_compute::CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::LOG);
+}
+
+void CLRound::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
+    k->configure(compile_context, input, output, ElementWiseUnary::ROUND);
+    _kernel = std::move(k);
+}
+
+Status CLRound::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return arm_compute::CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::ROUND);
+}
+} // namespace experimental
+
+struct CLRsqrtLayer::Impl
+{
+    const ICLTensor                       *src{ nullptr };
+    ICLTensor                             *dst{ nullptr };
+    std::unique_ptr<experimental::CLRsqrt> op{ nullptr };
+};
+
+CLRsqrtLayer::CLRsqrtLayer()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+
+CLRsqrtLayer::CLRsqrtLayer(CLRsqrtLayer &&) = default;
+CLRsqrtLayer &CLRsqrtLayer::operator=(CLRsqrtLayer &&) = default;
+CLRsqrtLayer::~CLRsqrtLayer()                          = default;
+
 void CLRsqrtLayer::configure(const ICLTensor *input, ICLTensor *output)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output);
@@ -37,15 +140,41 @@
 
 void CLRsqrtLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
-    k->configure(compile_context, input, output, ElementWiseUnary::RSQRT);
-    _kernel = std::move(k);
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = arm_compute::support::cpp14::make_unique<experimental::CLRsqrt>();
+    _impl->op->configure(compile_context, input->info(), output->info());
 }
+
 Status CLRsqrtLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::RSQRT);
+    return experimental::CLRsqrt::validate(input, output);
 }
 
+void CLRsqrtLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct CLExpLayer::Impl
+{
+    const ICLTensor                     *src{ nullptr };
+    ICLTensor                           *dst{ nullptr };
+    std::unique_ptr<experimental::CLExp> op{ nullptr };
+};
+
+CLExpLayer::CLExpLayer()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+
+CLExpLayer::CLExpLayer(CLExpLayer &&) = default;
+CLExpLayer &CLExpLayer::operator=(CLExpLayer &&) = default;
+CLExpLayer::~CLExpLayer()                        = default;
+
 void CLExpLayer::configure(const ICLTensor *input, ICLTensor *output)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output);
@@ -53,15 +182,41 @@
 
 void CLExpLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
-    k->configure(compile_context, input, output, ElementWiseUnary::EXP);
-    _kernel = std::move(k);
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = arm_compute::support::cpp14::make_unique<experimental::CLExp>();
+    _impl->op->configure(compile_context, input->info(), output->info());
 }
+
 Status CLExpLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::EXP);
+    return experimental::CLExp::validate(input, output);
 }
 
+void CLExpLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct CLNegLayer::Impl
+{
+    const ICLTensor                     *src{ nullptr };
+    ICLTensor                           *dst{ nullptr };
+    std::unique_ptr<experimental::CLNeg> op{ nullptr };
+};
+
+CLNegLayer::CLNegLayer()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+
+CLNegLayer::CLNegLayer(CLNegLayer &&) = default;
+CLNegLayer &CLNegLayer::operator=(CLNegLayer &&) = default;
+CLNegLayer::~CLNegLayer()                        = default;
+
 void CLNegLayer::configure(const ICLTensor *input, ICLTensor *output)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output);
@@ -69,15 +224,40 @@
 
 void CLNegLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
-    k->configure(compile_context, input, output, ElementWiseUnary::NEG);
-    _kernel = std::move(k);
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = arm_compute::support::cpp14::make_unique<experimental::CLNeg>();
+    _impl->op->configure(compile_context, input->info(), output->info());
 }
 Status CLNegLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::NEG);
+    return experimental::CLNeg::validate(input, output);
 }
 
+void CLNegLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct CLSinLayer::Impl
+{
+    const ICLTensor                     *src{ nullptr };
+    ICLTensor                           *dst{ nullptr };
+    std::unique_ptr<experimental::CLSin> op{ nullptr };
+};
+
+CLSinLayer::CLSinLayer()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+
+CLSinLayer::CLSinLayer(CLSinLayer &&) = default;
+CLSinLayer &CLSinLayer::operator=(CLSinLayer &&) = default;
+CLSinLayer::~CLSinLayer()                        = default;
+
 void CLSinLayer::configure(const ICLTensor *input, ICLTensor *output)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output);
@@ -85,15 +265,40 @@
 
 void CLSinLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
-    k->configure(compile_context, input, output, ElementWiseUnary::SIN);
-    _kernel = std::move(k);
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = arm_compute::support::cpp14::make_unique<experimental::CLSin>();
+    _impl->op->configure(compile_context, input->info(), output->info());
 }
 Status CLSinLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::SIN);
+    return experimental::CLSin::validate(input, output);
 }
 
+void CLSinLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct CLAbsLayer::Impl
+{
+    const ICLTensor                     *src{ nullptr };
+    ICLTensor                           *dst{ nullptr };
+    std::unique_ptr<experimental::CLAbs> op{ nullptr };
+};
+
+CLAbsLayer::CLAbsLayer()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+
+CLAbsLayer::CLAbsLayer(CLAbsLayer &&) = default;
+CLAbsLayer &CLAbsLayer::operator=(CLAbsLayer &&) = default;
+CLAbsLayer::~CLAbsLayer()                        = default;
+
 void CLAbsLayer::configure(const ICLTensor *input, ICLTensor *output)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output);
@@ -101,14 +306,40 @@
 
 void CLAbsLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
-    k->configure(compile_context, input, output, ElementWiseUnary::ABS);
-    _kernel = std::move(k);
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = arm_compute::support::cpp14::make_unique<experimental::CLAbs>();
+    _impl->op->configure(compile_context, input->info(), output->info());
 }
 Status CLAbsLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::ABS);
+    return experimental::CLAbs::validate(input, output);
 }
+
+void CLAbsLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct CLLogLayer::Impl
+{
+    const ICLTensor                     *src{ nullptr };
+    ICLTensor                           *dst{ nullptr };
+    std::unique_ptr<experimental::CLLog> op{ nullptr };
+};
+
+CLLogLayer::CLLogLayer()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+
+CLLogLayer::CLLogLayer(CLLogLayer &&) = default;
+CLLogLayer &CLLogLayer::operator=(CLLogLayer &&) = default;
+CLLogLayer::~CLLogLayer()                        = default;
+
 void CLLogLayer::configure(const ICLTensor *input, ICLTensor *output)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output);
@@ -116,15 +347,40 @@
 
 void CLLogLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
-    k->configure(compile_context, input, output, ElementWiseUnary::LOG);
-    _kernel = std::move(k);
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = arm_compute::support::cpp14::make_unique<experimental::CLLog>();
+    _impl->op->configure(compile_context, input->info(), output->info());
 }
 Status CLLogLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::LOG);
+    return experimental::CLLog::validate(input, output);
 }
 
+void CLLogLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct CLRoundLayer::Impl
+{
+    const ICLTensor                       *src{ nullptr };
+    ICLTensor                             *dst{ nullptr };
+    std::unique_ptr<experimental::CLRound> op{ nullptr };
+};
+
+CLRoundLayer::CLRoundLayer()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+
+CLRoundLayer::CLRoundLayer(CLRoundLayer &&) = default;
+CLRoundLayer &CLRoundLayer::operator=(CLRoundLayer &&) = default;
+CLRoundLayer::~CLRoundLayer()                          = default;
+
 void CLRoundLayer::configure(const ICLTensor *input, ICLTensor *output)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output);
@@ -132,13 +388,21 @@
 
 void CLRoundLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLElementWiseUnaryLayerKernel>();
-    k->configure(compile_context, input, output, ElementWiseUnary::ROUND);
-    _kernel = std::move(k);
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = arm_compute::support::cpp14::make_unique<experimental::CLRound>();
+    _impl->op->configure(compile_context, input->info(), output->info());
 }
 Status CLRoundLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    return CLElementWiseUnaryLayerKernel::validate(input, output, ElementWiseUnary::ROUND);
+    return experimental::CLRound::validate(input, output);
 }
 
+void CLRoundLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp
index 20e9545..6f66472 100644
--- a/src/runtime/CL/functions/CLElementwiseOperations.cpp
+++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
@@ -33,26 +34,45 @@
 {
 namespace
 {
-void configure_border_handler(const CLCompileContext &compile_context, CLFillBorderKernel &border_handler, BorderSize border_size, ICLTensor *input1, ICLTensor *input2, const ICLTensor *output)
+void configure_border_handler(const CLCompileContext &compile_context, CLFillBorderKernel &border_handler, BorderSize border_size, ITensorInfo *input1, ITensorInfo *input2, const ITensorInfo *output)
 {
-    if(output->info()->dimension(0) > 1)
+    if(output->dimension(0) > 1)
     {
-        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+        ITensorInfo *broadcasted_info = (input1->dimension(0) == 1) ? input1 : input2;
 
-        if(broadcasted_info->info()->dimension(0) == 1)
+        if(broadcasted_info->dimension(0) == 1)
         {
             border_handler.configure(compile_context, broadcasted_info, border_size, BorderMode::REPLICATE);
         }
     }
 }
+
+ITensorPack select_border_input(ITensorPack &tensors)
+{
+    ITensorPack pack;
+    if(tensors.get_tensor(TensorType::ACL_DST)->info()->dimension(0) > 1)
+    {
+        if(tensors.get_const_tensor(TensorType::ACL_SRC_1)->info()->dimension(0) == 1)
+        {
+            pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(TensorType::ACL_SRC_1));
+        }
+        else
+        {
+            pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(TensorType::ACL_SRC_0));
+        }
+    }
+    return pack;
+}
 } // namespace
 
-void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+namespace experimental
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info);
+CLArithmeticAddition::CLArithmeticAddition()
+    : _border_handler()
+{
 }
 
-void CLArithmeticAddition::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CLArithmeticAddition::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLSaturatedArithmeticOperationKernel>();
     k->configure(compile_context, ArithmeticOperation::ADD, input1, input2, output, policy, act_info);
@@ -65,12 +85,19 @@
     return CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, input1, input2, output, policy, act_info);
 }
 
-void CLArithmeticSubtraction::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CLArithmeticAddition::run(ITensorPack &tensors)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info);
+    auto border_pack = select_border_input(tensors);
+    CLScheduler::get().enqueue_op(_border_handler, border_pack);
+    ICLOperator::run(tensors);
 }
 
-void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+CLArithmeticSubtraction::CLArithmeticSubtraction()
+    : _border_handler()
+{
+}
+void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy,
+                                        const ActivationLayerInfo &act_info)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLSaturatedArithmeticOperationKernel>();
     k->configure(compile_context, ArithmeticOperation::SUB, input1, input2, output, policy, act_info);
@@ -84,12 +111,19 @@
     return CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::SUB, input1, input2, output, policy, act_info);
 }
 
-void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLArithmeticSubtraction::run(ITensorPack &tensors)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
+    auto border_pack = select_border_input(tensors);
+    CLScheduler::get().enqueue_op(_border_handler, border_pack);
+    ICLOperator::run(tensors);
 }
 
-void CLArithmeticDivision::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+CLArithmeticDivision::CLArithmeticDivision()
+    : _border_handler()
+{
+}
+
+void CLArithmeticDivision::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
     k->configure(compile_context, ArithmeticOperation::DIV, input1, input2, output, act_info);
@@ -102,12 +136,19 @@
     return CLArithmeticOperationKernel::validate(ArithmeticOperation::DIV, input1, input2, output, act_info);
 }
 
-void CLElementwiseMax::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLArithmeticDivision::run(ITensorPack &tensors)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
+    auto border_pack = select_border_input(tensors);
+    CLScheduler::get().enqueue_op(_border_handler, border_pack);
+    ICLOperator::run(tensors);
 }
 
-void CLElementwiseMax::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+CLElementwiseMax::CLElementwiseMax()
+    : _border_handler()
+{
+}
+
+void CLElementwiseMax::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
     k->configure(compile_context, ArithmeticOperation::MAX, input1, input2, output, act_info);
@@ -120,12 +161,19 @@
     return CLArithmeticOperationKernel::validate(ArithmeticOperation::MAX, input1, input2, output, act_info);
 }
 
-void CLElementwiseMin::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseMax::run(ITensorPack &tensors)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
+    auto border_pack = select_border_input(tensors);
+    CLScheduler::get().enqueue_op(_border_handler, border_pack);
+    ICLOperator::run(tensors);
 }
 
-void CLElementwiseMin::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+CLElementwiseMin::CLElementwiseMin()
+    : _border_handler()
+{
+}
+
+void CLElementwiseMin::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
     k->configure(compile_context, ArithmeticOperation::MIN, input1, input2, output, act_info);
@@ -138,12 +186,19 @@
     return CLArithmeticOperationKernel::validate(ArithmeticOperation::MIN, input1, input2, output, act_info);
 }
 
-void CLElementwiseSquaredDiff::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseMin::run(ITensorPack &tensors)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
+    auto border_pack = select_border_input(tensors);
+    CLScheduler::get().enqueue_op(_border_handler, border_pack);
+    ICLOperator::run(tensors);
 }
 
-void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+CLElementwiseSquaredDiff::CLElementwiseSquaredDiff()
+    : _border_handler()
+{
+}
+
+void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
     k->configure(compile_context, ArithmeticOperation::SQUARED_DIFF, input1, input2, output, act_info);
@@ -156,12 +211,19 @@
     return CLArithmeticOperationKernel::validate(ArithmeticOperation::SQUARED_DIFF, input1, input2, output, act_info);
 }
 
-void CLElementwisePower::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseSquaredDiff::run(ITensorPack &tensors)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
+    auto border_pack = select_border_input(tensors);
+    CLScheduler::get().enqueue_op(_border_handler, border_pack);
+    ICLOperator::run(tensors);
 }
 
-void CLElementwisePower::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+CLElementwisePower::CLElementwisePower()
+    : _border_handler()
+{
+}
+
+void CLElementwisePower::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
     k->configure(compile_context, ArithmeticOperation::POWER, input1, input2, output, act_info);
@@ -174,4 +236,329 @@
     return CLArithmeticOperationKernel::validate(ArithmeticOperation::POWER, input1, input2, output, act_info);
 }
 
+void CLElementwisePower::run(ITensorPack &tensors)
+{
+    auto border_pack = select_border_input(tensors);
+    CLScheduler::get().enqueue_op(_border_handler, border_pack);
+    ICLOperator::run(tensors);
+}
+} // namespace experimental
+
+struct CLArithmeticAddition::Impl
+{
+    const ICLTensor                                    *src_0{ nullptr };
+    const ICLTensor                                    *src_1{ nullptr };
+    ICLTensor                                          *dst{ nullptr };
+    std::unique_ptr<experimental::CLArithmeticAddition> op{ nullptr };
+};
+
+CLArithmeticAddition::CLArithmeticAddition()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+CLArithmeticAddition::CLArithmeticAddition(CLArithmeticAddition &&) = default;
+CLArithmeticAddition &CLArithmeticAddition::operator=(CLArithmeticAddition &&) = default;
+CLArithmeticAddition::~CLArithmeticAddition()                                  = default;
+
+void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info);
+}
+
+void CLArithmeticAddition::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy,
+                                     const ActivationLayerInfo &act_info)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::CLArithmeticAddition>();
+    _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), policy, act_info);
+}
+
+Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+{
+    return experimental::CLArithmeticAddition::validate(input1, input2, output, policy, act_info);
+}
+
+void CLArithmeticAddition::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+
+    _impl->op->run(pack);
+}
+
+struct CLArithmeticSubtraction::Impl
+{
+    const ICLTensor                                       *src_0{ nullptr };
+    const ICLTensor                                       *src_1{ nullptr };
+    ICLTensor                                             *dst{ nullptr };
+    std::unique_ptr<experimental::CLArithmeticSubtraction> op{ nullptr };
+};
+
+CLArithmeticSubtraction::CLArithmeticSubtraction()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+CLArithmeticSubtraction::CLArithmeticSubtraction(CLArithmeticSubtraction &&) = default;
+CLArithmeticSubtraction &CLArithmeticSubtraction::operator=(CLArithmeticSubtraction &&) = default;
+CLArithmeticSubtraction::~CLArithmeticSubtraction()                                     = default;
+
+void CLArithmeticSubtraction::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info);
+}
+
+void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy,
+                                        const ActivationLayerInfo &act_info)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::CLArithmeticSubtraction>();
+    _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), policy, act_info);
+}
+
+Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+{
+    return experimental::CLArithmeticSubtraction::validate(input1, input2, output, policy, act_info);
+}
+
+void CLArithmeticSubtraction::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+
+    _impl->op->run(pack);
+}
+
+struct CLArithmeticDivision::Impl
+{
+    const ICLTensor                                    *src_0{ nullptr };
+    const ICLTensor                                    *src_1{ nullptr };
+    ICLTensor                                          *dst{ nullptr };
+    std::unique_ptr<experimental::CLArithmeticDivision> op{ nullptr };
+};
+
+CLArithmeticDivision::CLArithmeticDivision()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+CLArithmeticDivision::CLArithmeticDivision(CLArithmeticDivision &&) = default;
+CLArithmeticDivision &CLArithmeticDivision::operator=(CLArithmeticDivision &&) = default;
+CLArithmeticDivision::~CLArithmeticDivision()                                  = default;
+
+void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
+}
+
+void CLArithmeticDivision::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::CLArithmeticDivision>();
+    _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
+}
+
+Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+    return experimental::CLArithmeticDivision::validate(input1, input2, output, act_info);
+}
+
+void CLArithmeticDivision::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+
+    _impl->op->run(pack);
+}
+
+struct CLElementwiseMax::Impl
+{
+    const ICLTensor                                *src_0{ nullptr };
+    const ICLTensor                                *src_1{ nullptr };
+    ICLTensor                                      *dst{ nullptr };
+    std::unique_ptr<experimental::CLElementwiseMax> op{ nullptr };
+};
+
+CLElementwiseMax::CLElementwiseMax()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+CLElementwiseMax::CLElementwiseMax(CLElementwiseMax &&) = default;
+CLElementwiseMax &CLElementwiseMax::operator=(CLElementwiseMax &&) = default;
+CLElementwiseMax::~CLElementwiseMax()                              = default;
+
+void CLElementwiseMax::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
+}
+
+void CLElementwiseMax::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::CLElementwiseMax>();
+    _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
+}
+
+Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+    return experimental::CLElementwiseMax::validate(input1, input2, output, act_info);
+}
+
+void CLElementwiseMax::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+
+    _impl->op->run(pack);
+}
+
+struct CLElementwiseMin::Impl
+{
+    const ICLTensor                                *src_0{ nullptr };
+    const ICLTensor                                *src_1{ nullptr };
+    ICLTensor                                      *dst{ nullptr };
+    std::unique_ptr<experimental::CLElementwiseMin> op{ nullptr };
+};
+
+CLElementwiseMin::CLElementwiseMin()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+CLElementwiseMin::CLElementwiseMin(CLElementwiseMin &&) = default;
+CLElementwiseMin &CLElementwiseMin::operator=(CLElementwiseMin &&) = default;
+CLElementwiseMin::~CLElementwiseMin()                              = default;
+
+void CLElementwiseMin::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
+}
+
+void CLElementwiseMin::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::CLElementwiseMin>();
+    _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
+}
+
+Status CLElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+    return experimental::CLElementwiseMin::validate(input1, input2, output, act_info);
+}
+
+void CLElementwiseMin::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+
+    _impl->op->run(pack);
+}
+
+struct CLElementwiseSquaredDiff::Impl
+{
+    const ICLTensor                                        *src_0{ nullptr };
+    const ICLTensor                                        *src_1{ nullptr };
+    ICLTensor                                              *dst{ nullptr };
+    std::unique_ptr<experimental::CLElementwiseSquaredDiff> op{ nullptr };
+    std::unique_ptr<CLFillBorderKernel>                     _border_handler{ nullptr };
+};
+
+CLElementwiseSquaredDiff::CLElementwiseSquaredDiff()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+CLElementwiseSquaredDiff::CLElementwiseSquaredDiff(CLElementwiseSquaredDiff &&) = default;
+CLElementwiseSquaredDiff &CLElementwiseSquaredDiff::operator=(CLElementwiseSquaredDiff &&) = default;
+CLElementwiseSquaredDiff::~CLElementwiseSquaredDiff()                                      = default;
+
+void CLElementwiseSquaredDiff::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
+}
+
+void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::CLElementwiseSquaredDiff>();
+    _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
+}
+
+Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+    return experimental::CLElementwiseSquaredDiff::validate(input1, input2, output, act_info);
+}
+
+void CLElementwiseSquaredDiff::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+
+    _impl->op->run(pack);
+}
+
+struct CLElementwisePower::Impl
+{
+    const ICLTensor                                  *src_0{ nullptr };
+    const ICLTensor                                  *src_1{ nullptr };
+    ICLTensor                                        *dst{ nullptr };
+    std::unique_ptr<experimental::CLElementwisePower> op{ nullptr };
+};
+
+CLElementwisePower::CLElementwisePower()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+CLElementwisePower::CLElementwisePower(CLElementwisePower &&) = default;
+CLElementwisePower &CLElementwisePower::operator=(CLElementwisePower &&) = default;
+CLElementwisePower::~CLElementwisePower()                                = default;
+
+void CLElementwisePower::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
+}
+
+void CLElementwisePower::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::CLElementwisePower>();
+    _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
+}
+
+Status CLElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+    return experimental::CLElementwisePower::validate(input1, input2, output, act_info);
+}
+
+void CLElementwisePower::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+
+    _impl->op->run(pack);
+}
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLEqualizeHistogram.cpp b/src/runtime/CL/functions/CLEqualizeHistogram.cpp
index e1bd7e6..a1158a7 100644
--- a/src/runtime/CL/functions/CLEqualizeHistogram.cpp
+++ b/src/runtime/CL/functions/CLEqualizeHistogram.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLErode.cpp b/src/runtime/CL/functions/CLErode.cpp
index 8106148..5236f62 100644
--- a/src/runtime/CL/functions/CLErode.cpp
+++ b/src/runtime/CL/functions/CLErode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLFFT1D.cpp b/src/runtime/CL/functions/CLFFT1D.cpp
index c3922f5..7d15d33 100644
--- a/src/runtime/CL/functions/CLFFT1D.cpp
+++ b/src/runtime/CL/functions/CLFFT1D.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLFFT2D.cpp b/src/runtime/CL/functions/CLFFT2D.cpp
index 2482ea9..7ab852f 100644
--- a/src/runtime/CL/functions/CLFFT2D.cpp
+++ b/src/runtime/CL/functions/CLFFT2D.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
index ff439cc..1def674 100644
--- a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLFastCorners.cpp b/src/runtime/CL/functions/CLFastCorners.cpp
index f51abf0..97f853f 100644
--- a/src/runtime/CL/functions/CLFastCorners.cpp
+++ b/src/runtime/CL/functions/CLFastCorners.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLFill.cpp b/src/runtime/CL/functions/CLFill.cpp
index 7b96ed1..6c0f178 100644
--- a/src/runtime/CL/functions/CLFill.cpp
+++ b/src/runtime/CL/functions/CLFill.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLFillBorder.cpp b/src/runtime/CL/functions/CLFillBorder.cpp
index f9d7396..c647bb6 100644
--- a/src/runtime/CL/functions/CLFillBorder.cpp
+++ b/src/runtime/CL/functions/CLFillBorder.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLFlattenLayer.cpp b/src/runtime/CL/functions/CLFlattenLayer.cpp
index 9a247cc..a826541 100644
--- a/src/runtime/CL/functions/CLFlattenLayer.cpp
+++ b/src/runtime/CL/functions/CLFlattenLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLFloor.cpp b/src/runtime/CL/functions/CLFloor.cpp
index 44e1d39..7ed92ac 100644
--- a/src/runtime/CL/functions/CLFloor.cpp
+++ b/src/runtime/CL/functions/CLFloor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index ecbac6f..4f365b6 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
index 6deecdc..825267c 100644
--- a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
+++ b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 8466024..4a74630 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -291,8 +291,16 @@
     std::unique_ptr<ICLGEMMKernelConfiguration> gemm_config = CLGEMMReshapedOnlyRHSKernelConfigurationFactory::create(gpu_target);
     ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get());
 
+    unsigned int m_internal = m;
+    unsigned int b_internal = batch_size;
+    if(reinterpret_input_as_3d)
+    {
+        m_internal = a->info()->dimension(1);
+        b_internal = a->info()->dimension(2);
+    }
+
     // Configure lhs_info and rhs_info
-    std::tie(lhs_info, rhs_info) = gemm_config->configure(m, n, k, batch_size, data_type);
+    std::tie(lhs_info, rhs_info) = gemm_config->configure(m_internal, n, k, b_internal, data_type);
 
     ICLTensor *reshaped_rhs = &_tmp_b;
     if(_weights_manager && _weights_manager->are_weights_managed(b))
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 1c37993..ee90b39 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -312,7 +312,6 @@
         shape_gemm.set(0, mat_weights_cols);
         shape_gemm.set(1, conv_w * conv_h);
 
-        // TODO(COMPMID-2078): input->clone() doesn't work with subtensors for grouped convolutions.
         TensorInfo info_gemm(shape_gemm, 1, data_type);
         info_gemm.set_quantization_info(output->info()->quantization_info()).set_data_layout(input->info()->data_layout());
         _gemm_output.allocator()->init(info_gemm);
@@ -418,11 +417,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
     const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type());
 
-    if(is_quantized_per_channel)
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() != DataType::QASYMM8, "Input data type not compatible with Weights");
-    }
-    else
+    if(!is_quantized_per_channel)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     }
diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
index 1dcb341..5fc9c17 100644
--- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 84da4a7..30dce5b 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -117,7 +117,7 @@
     _output                      = output;
 
     _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->info()->data_type()) && is_data_type_quantized_symmetric(b->info()->data_type())
-                          && is_data_type_quantized_asymmetric(a->info()->data_type());
+                          && a->info()->data_type() == DataType::QASYMM8;
     _b_offset = _convert_to_qasymm8 ? -128 : b->info()->quantization_info().uniform().offset;
 
     // Get the GPU target
diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
index 9ae5d51..a499e18 100644
--- a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,45 +33,6 @@
 
 namespace arm_compute
 {
-void CLGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max)
-{
-    GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo();
-    info.gemmlowp_offset         = result_offset;
-    info.gemmlowp_multiplier     = result_mult_int;
-    info.gemmlowp_shift          = result_shift;
-    info.gemmlowp_min_bound      = min;
-    info.gemmlowp_max_bound      = max;
-
-    auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleKernel>();
-    k->configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, &info);
-    _kernel = std::move(k);
-}
-
-void CLGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset,
-                                                        int result_mult_int,
-                                                        int result_shift, int min, int max)
-{
-    GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo();
-    info.gemmlowp_offset         = result_offset;
-    info.gemmlowp_multiplier     = result_mult_int;
-    info.gemmlowp_shift          = result_shift;
-    info.gemmlowp_min_bound      = min;
-    info.gemmlowp_max_bound      = max;
-
-    auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleKernel>();
-    k->configure(compile_context, input, bias, output, &info);
-    _kernel = std::move(k);
-}
-
-Status CLGEMMLowpQuantizeDownInt32ToUint8Scale::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
-{
-    GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo();
-    info.gemmlowp_min_bound      = min;
-    info.gemmlowp_max_bound      = max;
-
-    return CLGEMMLowpQuantizeDownInt32ScaleKernel::validate(input, bias, output, &info);
-}
-
 void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
                                                                     int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
                                                                     int min, int max)
@@ -118,45 +79,6 @@
     return CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(input, bias, output, min, max);
 }
 
-void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
-                                                               float multiplier, int offset,
-                                                               int min, int max)
-{
-    GEMMLowpOutputStageInfo info  = GEMMLowpOutputStageInfo();
-    info.gemmlowp_offset          = offset;
-    info.gemmlowp_real_multiplier = multiplier;
-    info.gemmlowp_min_bound       = min;
-    info.gemmlowp_max_bound       = max;
-
-    auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel>();
-    k->configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, &info);
-    _kernel = std::move(k);
-}
-
-void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
-                                                               float multiplier, int offset,
-                                                               int min, int max)
-{
-    GEMMLowpOutputStageInfo info  = GEMMLowpOutputStageInfo();
-    info.gemmlowp_offset          = offset;
-    info.gemmlowp_real_multiplier = multiplier;
-    info.gemmlowp_min_bound       = min;
-    info.gemmlowp_max_bound       = max;
-
-    auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel>();
-    k->configure(compile_context, input, bias, output, &info);
-    _kernel = std::move(k);
-}
-
-Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
-                                                                int min, int max)
-{
-    GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo();
-    info.gemmlowp_min_bound      = min;
-    info.gemmlowp_max_bound      = max;
-    return CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel::validate(input, bias, output, &info);
-}
-
 void CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
                                                                     int result_fixedpoint_multiplier, int result_shift,
                                                                     int min, int max)
@@ -268,4 +190,4 @@
             return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported GEMMLowpOutputStage type.");
     }
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLGather.cpp b/src/runtime/CL/functions/CLGather.cpp
index e2b18e0..d9b6679 100644
--- a/src/runtime/CL/functions/CLGather.cpp
+++ b/src/runtime/CL/functions/CLGather.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLGaussian3x3.cpp b/src/runtime/CL/functions/CLGaussian3x3.cpp
index 47367c4..c62e200 100644
--- a/src/runtime/CL/functions/CLGaussian3x3.cpp
+++ b/src/runtime/CL/functions/CLGaussian3x3.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLGaussian5x5.cpp b/src/runtime/CL/functions/CLGaussian5x5.cpp
index 6b82cd0..1fe2fdd 100644
--- a/src/runtime/CL/functions/CLGaussian5x5.cpp
+++ b/src/runtime/CL/functions/CLGaussian5x5.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLGaussianPyramid.cpp b/src/runtime/CL/functions/CLGaussianPyramid.cpp
index 1ac9878..297d535 100644
--- a/src/runtime/CL/functions/CLGaussianPyramid.cpp
+++ b/src/runtime/CL/functions/CLGaussianPyramid.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -175,7 +175,7 @@
             _gauss5x5[i].configure(compile_context, _pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
 
             /* Configure scale image kernel */
-            _scale_nearest[i].configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, SamplingPolicy::CENTER);
+            _scale_nearest[i].configure(compile_context, _tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), ScaleKernelInfo{ InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, PixelValue(), SamplingPolicy::CENTER });
         }
 
         _tmp.allocate();
diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
index 7f037fc..45dc402 100644
--- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
+++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,9 +31,9 @@
 CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager),
       _permute_deltas_kernel(),
-      _flatten_deltas_kernel(),
+      _flatten_deltas(),
       _permute_scores_kernel(),
-      _flatten_scores_kernel(),
+      _flatten_scores(),
       _compute_anchors_kernel(),
       _bounding_box_kernel(),
       _pad_kernel(),
@@ -102,12 +102,12 @@
     {
         _memory_group.manage(&_deltas_permuted);
         _permute_deltas_kernel.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
-        _flatten_deltas_kernel.configure(compile_context, &_deltas_permuted, &_deltas_flattened);
+        _flatten_deltas.configure(compile_context, &_deltas_permuted, &_deltas_flattened);
         _deltas_permuted.allocator()->allocate();
     }
     else
     {
-        _flatten_deltas_kernel.configure(compile_context, deltas, &_deltas_flattened);
+        _flatten_deltas.configure(compile_context, deltas, &_deltas_flattened);
     }
 
     const TensorShape flatten_shape_scores(1, total_num_anchors);
@@ -119,12 +119,12 @@
     {
         _memory_group.manage(&_scores_permuted);
         _permute_scores_kernel.configure(compile_context, scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
-        _flatten_scores_kernel.configure(compile_context, &_scores_permuted, &_scores_flattened);
+        _flatten_scores.configure(compile_context, &_scores_permuted, &_scores_flattened);
         _scores_permuted.allocator()->allocate();
     }
     else
     {
-        _flatten_scores_kernel.configure(compile_context, scores, &_scores_flattened);
+        _flatten_scores.configure(compile_context, scores, &_scores_flattened);
     }
 
     CLTensor *anchors_to_use = &_all_anchors;
@@ -240,12 +240,12 @@
     }
 
     TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&deltas_permuted_info, &deltas_flattened_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&deltas_permuted_info, &deltas_flattened_info));
 
     TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
     TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&scores_permuted_info, &scores_flattened_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&scores_permuted_info, &scores_flattened_info));
 
     TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values;
     TensorInfo  proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
@@ -350,8 +350,8 @@
         CLScheduler::get().enqueue(_permute_deltas_kernel, false);
         CLScheduler::get().enqueue(_permute_scores_kernel, false);
     }
-    CLScheduler::get().enqueue(_flatten_deltas_kernel, false);
-    CLScheduler::get().enqueue(_flatten_scores_kernel, false);
+    _flatten_deltas.run();
+    _flatten_scores.run();
 
     if(_is_qasymm8)
     {
diff --git a/src/runtime/CL/functions/CLHOGDescriptor.cpp b/src/runtime/CL/functions/CLHOGDescriptor.cpp
index 0645cfd..21fa669 100644
--- a/src/runtime/CL/functions/CLHOGDescriptor.cpp
+++ b/src/runtime/CL/functions/CLHOGDescriptor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLHOGDetector.cpp b/src/runtime/CL/functions/CLHOGDetector.cpp
index bf9bae1..9188f65 100644
--- a/src/runtime/CL/functions/CLHOGDetector.cpp
+++ b/src/runtime/CL/functions/CLHOGDetector.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLHOGGradient.cpp b/src/runtime/CL/functions/CLHOGGradient.cpp
index acf5f2c..934d1f6 100644
--- a/src/runtime/CL/functions/CLHOGGradient.cpp
+++ b/src/runtime/CL/functions/CLHOGGradient.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
index 248f730..51db43c 100644
--- a/src/runtime/CL/functions/CLHOGMultiDetection.cpp
+++ b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp
index aecec0d..45b93a5 100644
--- a/src/runtime/CL/functions/CLHarrisCorners.cpp
+++ b/src/runtime/CL/functions/CLHarrisCorners.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLHistogram.cpp b/src/runtime/CL/functions/CLHistogram.cpp
index e723024..f278cf0 100644
--- a/src/runtime/CL/functions/CLHistogram.cpp
+++ b/src/runtime/CL/functions/CLHistogram.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
index 273a873..fce1fe4 100644
--- a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLIntegralImage.cpp b/src/runtime/CL/functions/CLIntegralImage.cpp
index b3be2f8..8561494 100644
--- a/src/runtime/CL/functions/CLIntegralImage.cpp
+++ b/src/runtime/CL/functions/CLIntegralImage.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
index 14c83cd..66191d1 100644
--- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index 56f22e2..058b602 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -55,7 +55,7 @@
                             const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
                             const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
                             const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
-                            const ICLTensor *output_state_in, const ICLTensor *cell_state_in,
+                            const ICLTensor *output_state_in, ICLTensor *cell_state_in,
                             ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output,
                             const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
 {
@@ -68,7 +68,7 @@
                             const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
                             const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
                             const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
-                            const ICLTensor *output_state_in, const ICLTensor *cell_state_in,
+                            const ICLTensor *output_state_in, ICLTensor *cell_state_in,
                             ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output,
                             const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
 {
@@ -110,7 +110,7 @@
     _forget_gate_out2.allocator()->init(TensorInfo(concat_shape, 1, input->info()->data_type()));
 
     _memory_group.manage(&_forget_gate_out2);
-    _concat_inputs_forget_gate.configure(compile_context, input, output_state_in, &_forget_gate_out2);
+    _concat_inputs_forget_gate.configure(compile_context, inputs_vector, &_forget_gate_out2, Window::DimX);
 
     std::vector<const ICLTensor *> weights_vector;
 
@@ -119,7 +119,7 @@
     const TensorShape weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0);
     _forget_gate_out6.allocator()->init(TensorInfo(weights_concat_shape, 1, input->info()->data_type()));
 
-    _concat_weights_forget_gate.configure(compile_context, input_to_forget_weights, recurrent_to_forget_weights, &_forget_gate_out6);
+    _concat_weights_forget_gate.configure(compile_context, weights_vector, &_forget_gate_out6, Window::DimX);
 
     _memory_group.manage(&_forget_gate_out5);
     _fully_connected_forget_gate.configure(compile_context, &_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5);
@@ -155,7 +155,7 @@
                                                    RoundingPolicy::TO_NEAREST_EVEN);
         // forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
         forget_gate_out->allocator()->allocate();
-        _accum_forget_gate_bias.configure(compile_context, ArithmeticOperation::ADD, &_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE);
+        _accum_forget_gate_bias.configure(compile_context, &_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE);
         _forget_layer_norm_out1.allocator()->allocate();
         forget_gate_out = &_forget_layer_norm_out2;
     }
@@ -173,7 +173,7 @@
         _memory_group.manage(&_input_gate_out1);
         _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
         _ones_memset_kernel.configure(compile_context, &_ones, PixelValue(1, _ones.info()->data_type()));
-        _subtract_input_gate.configure(compile_context, ArithmeticOperation::SUB, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE);
+        _subtract_input_gate.configure(compile_context, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE);
         _ones.allocator()->allocate();
         _run_cifg_opt = true;
     }
@@ -188,7 +188,7 @@
         TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
         _input_gate_out2.allocator()->init(TensorInfo(lstm_weights_concat_shape, 1, input->info()->data_type()));
 
-        _concat_weights_input_gate.configure(compile_context, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &_input_gate_out2);
+        _concat_weights_input_gate.configure(compile_context, lstm_weights, &_input_gate_out2, Window::DimX);
 
         _memory_group.manage(&_input_gate_out1);
 
@@ -222,7 +222,7 @@
                                                       RoundingPolicy::TO_NEAREST_EVEN);
             // input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
             input_gate_out->allocator()->allocate();
-            _accum_input_gate_bias.configure(compile_context, ArithmeticOperation::ADD, &_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE);
+            _accum_input_gate_bias.configure(compile_context, &_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE);
             _input_layer_norm_out1.allocator()->allocate();
             input_gate_out = &_input_layer_norm_out2;
         }
@@ -246,7 +246,7 @@
     _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f);
     _cell_state_out2.allocator()->allocate();
     _memory_group.manage(&_cell_state_out4);
-    _accum_cell_state1.configure(compile_context, ArithmeticOperation::ADD, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
+    _accum_cell_state1.configure(compile_context, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
     CLTensor *cell_state_out_ptr = &_cell_state_out4;
     if(_is_layer_norm_lstm)
     {
@@ -259,7 +259,7 @@
                                                  RoundingPolicy::TO_NEAREST_EVEN);
         // cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before
         cell_state_out_ptr->allocator()->allocate();
-        _accum_cell_gate_bias.configure(compile_context, ArithmeticOperation::ADD, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE);
+        _accum_cell_gate_bias.configure(compile_context, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE);
         _cell_layer_norm_out1.allocator()->allocate();
         cell_state_out_ptr = &_cell_layer_norm_out2;
     }
@@ -268,7 +268,7 @@
     _pixelwise_mul_cell_state1.configure(compile_context, cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
     cell_state_out_ptr->allocator()->allocate();
     _pixelwise_mul_cell_state2.configure(compile_context, forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
-    _accum_cell_state2.configure(compile_context, ArithmeticOperation::ADD, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
+    _accum_cell_state2.configure(compile_context, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
     _cell_state_out3.allocator()->allocate();
     _cell_state_out5.allocator()->allocate();
     // Perform clipping
@@ -290,7 +290,7 @@
     TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
     _output2.allocator()->init(TensorInfo(in_out_weights_concat_shape, 1, input->info()->data_type()));
 
-    _concat_weights_output.configure(compile_context, input_to_output_weights, recurrent_to_output_weights, &_output2);
+    _concat_weights_output.configure(compile_context, in_out_weights, &_output2, Window::DimX);
 
     _memory_group.manage(&_output1);
     _memory_group.manage(&_output4);
@@ -329,7 +329,7 @@
                                                    RoundingPolicy::TO_NEAREST_EVEN);
         // output_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
         output_gate_out->allocator()->allocate();
-        _accum_output_gate_bias.configure(compile_context, ArithmeticOperation::ADD, &_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE);
+        _accum_output_gate_bias.configure(compile_context, &_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE);
         _output_layer_norm_out1.allocator()->allocate();
         output_gate_out = &_output_layer_norm_out2;
     }
@@ -371,7 +371,7 @@
     _copy_output.configure(compile_context, output_state_out, output);
 
     // Vector for holding the tensors to store in scratch buffer
-    std::vector<ICLTensor *> scratch_inputs;
+    std::vector<const ICLTensor *> scratch_inputs;
     if(!lstm_params.has_cifg_opt())
     {
         scratch_inputs.emplace_back(input_gate_out);
@@ -485,21 +485,21 @@
     const TensorShape concat_shape       = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
     TensorInfo        forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type());
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(input, output_state_in, &forget_gate_concat));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(inputs_vector, &forget_gate_concat, Window::DimX));
 
     if(lstm_params.has_peephole_opt())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
         ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
     }
     if(lstm_params.use_layer_norm())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&forget_gate));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE,
-                                                                              RoundingPolicy::TO_NEAREST_EVEN));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE,
+                                                                        RoundingPolicy::TO_NEAREST_EVEN));
         ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE));
     }
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
 
     // Validate input gate
     if(!lstm_params.has_cifg_opt())
@@ -516,7 +516,7 @@
         lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
         TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
         TensorInfo  lstm_gate_concat          = TensorInfo(lstm_weights_concat_shape, 1, input->data_type());
-        ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &lstm_gate_concat));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(lstm_weights, &lstm_gate_concat, Window::DimX));
 
         ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate));
 
@@ -524,21 +524,21 @@
         {
             ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
             ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1);
-            ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
             ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
         }
 
         if(lstm_params.use_layer_norm())
         {
             ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&input_gate));
-            ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
             ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE));
         }
-        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::SUB, &forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
     }
 
     // Validate cell state
@@ -548,18 +548,18 @@
     if(lstm_params.use_layer_norm())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&cell_state_tmp));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE,
-                                                                              RoundingPolicy::TO_NEAREST_EVEN));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE,
+                                                                        RoundingPolicy::TO_NEAREST_EVEN));
         ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE));
     }
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&cell_state_tmp, nullptr, activation_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, nullptr, activation_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
     ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
     if(cell_threshold != 0.f)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold,
-                                                                                                                    cell_threshold)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold,
+                                                                                                              cell_threshold)));
     }
 
     std::vector<const ITensorInfo *> in_out_weights;
@@ -567,35 +567,35 @@
     in_out_weights.emplace_back(recurrent_to_output_weights);
     TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
     TensorInfo  in_out_gate_concat          = TensorInfo(in_out_weights_concat_shape, 1, input->data_type());
-    ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(input_to_output_weights, recurrent_to_output_weights, &in_out_gate_concat));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(in_out_weights, &in_out_gate_concat, Window::DimX));
     // Validate output gate tmp
     ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp));
 
     if(lstm_params.has_peephole_opt())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
-                                                                              RoundingPolicy::TO_NEAREST_EVEN));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
+                                                                        RoundingPolicy::TO_NEAREST_EVEN));
         ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
     }
     if(lstm_params.use_layer_norm())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&output_gate_tmp));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
-                                                                              RoundingPolicy::TO_NEAREST_EVEN));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
+                                                                        RoundingPolicy::TO_NEAREST_EVEN));
         ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE));
     }
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
 
     // Validate output state
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&cell_state_tmp, &cell_state_tmp, activation_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, &cell_state_tmp, activation_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
     if(lstm_params.has_projection())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out));
         if(projection_threshold != 0.f)
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(output_state_out, output_state_out,
-                                                                          ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output_state_out, output_state_out,
+                                                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)));
         }
     }
 
@@ -604,7 +604,7 @@
     ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(output_state_out, output));
 
     // Validate scratch concatenation
-    std::vector<ITensorInfo *> inputs_vector_info_raw;
+    std::vector<const ITensorInfo *> inputs_vector_info_raw;
     if(!lstm_params.has_cifg_opt())
     {
         inputs_vector_info_raw.push_back(&input_gate);
@@ -623,27 +623,27 @@
 
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    CLScheduler::get().enqueue(_concat_inputs_forget_gate);
+    _concat_inputs_forget_gate.run();
 
     _fully_connected_forget_gate.run();
 
     if(_run_peephole_opt)
     {
-        CLScheduler::get().enqueue(_pixelwise_mul_forget_gate);
+        _pixelwise_mul_forget_gate.run();
         _accum_forget_gate1.run();
     }
     if(_is_layer_norm_lstm)
     {
         _mean_std_norm_forget_gate.run();
-        CLScheduler::get().enqueue(_pixelwise_mul_forget_gate_coeff);
-        CLScheduler::get().enqueue(_accum_forget_gate_bias);
+        _pixelwise_mul_forget_gate_coeff.run();
+        _accum_forget_gate_bias.run();
     }
-    CLScheduler::get().enqueue(_activation_forget_gate);
+    _activation_forget_gate.run();
 
     if(_run_cifg_opt)
     {
         CLScheduler::get().enqueue(_ones_memset_kernel);
-        CLScheduler::get().enqueue(_subtract_input_gate);
+        _subtract_input_gate.run();
     }
     else
     {
@@ -651,63 +651,63 @@
 
         if(_run_peephole_opt)
         {
-            CLScheduler::get().enqueue(_pixelwise_mul_input_gate);
+            _pixelwise_mul_input_gate.run();
             _accum_input_gate1.run();
         }
 
         if(_is_layer_norm_lstm)
         {
             _mean_std_norm_input_gate.run();
-            CLScheduler::get().enqueue(_pixelwise_mul_input_gate_coeff);
-            CLScheduler::get().enqueue(_accum_input_gate_bias);
+            _pixelwise_mul_input_gate_coeff.run();
+            _accum_input_gate_bias.run();
         }
-        CLScheduler::get().enqueue(_activation_input_gate);
+        _activation_input_gate.run();
     }
 
     _fully_connected_cell_state.run();
     CLScheduler::get().enqueue(_transpose_cell_state);
     _gemm_cell_state1.run();
-    CLScheduler::get().enqueue(_accum_cell_state1);
+    _accum_cell_state1.run();
     if(_is_layer_norm_lstm)
     {
         _mean_std_norm_cell_gate.run();
-        CLScheduler::get().enqueue(_pixelwise_mul_cell_gate_coeff);
-        CLScheduler::get().enqueue(_accum_cell_gate_bias);
+        _pixelwise_mul_cell_gate_coeff.run();
+        _accum_cell_gate_bias.run();
     }
-    CLScheduler::get().enqueue(_activation_cell_state);
-    CLScheduler::get().enqueue(_pixelwise_mul_cell_state1);
-    CLScheduler::get().enqueue(_pixelwise_mul_cell_state2);
-    CLScheduler::get().enqueue(_accum_cell_state2);
+    _activation_cell_state.run();
+    _pixelwise_mul_cell_state1.run();
+    _pixelwise_mul_cell_state2.run();
+    _accum_cell_state2.run();
 
     if(_perform_cell_clipping)
     {
-        CLScheduler::get().enqueue(_cell_clip);
+        _cell_clip.run();
     }
 
     _fully_connected_output.run();
 
     if(_run_peephole_opt)
     {
-        CLScheduler::get().enqueue(_pixelwise_mul_output_state1);
+        _pixelwise_mul_output_state1.run();
         _accum_output1.run();
     }
     if(_is_layer_norm_lstm)
     {
         _mean_std_norm_output_gate.run();
-        CLScheduler::get().enqueue(_pixelwise_mul_output_gate_coeff);
-        CLScheduler::get().enqueue(_accum_output_gate_bias);
+        _pixelwise_mul_output_gate_coeff.run();
+        _accum_output_gate_bias.run();
     }
-    CLScheduler::get().enqueue(_activation_output);
+    _activation_output.run();
 
-    CLScheduler::get().enqueue(_activation_output_state);
-    CLScheduler::get().enqueue(_pixelwise_mul_output_state2);
+    _activation_output_state.run();
+    _pixelwise_mul_output_state2.run();
 
     if(_has_projection_weights)
     {
         _fully_connected_output_state.run();
         if(_perform_projection_clipping)
         {
-            CLScheduler::get().enqueue(_projection_clip);
+            _projection_clip.run();
         }
     }
 
@@ -721,12 +721,12 @@
 {
     if(!_is_prepared)
     {
-        CLScheduler::get().enqueue(_concat_weights_forget_gate);
+        _concat_weights_forget_gate.run();
         if(!_run_cifg_opt)
         {
-            CLScheduler::get().enqueue(_concat_weights_input_gate);
+            _concat_weights_input_gate.run();
         }
-        CLScheduler::get().enqueue(_concat_weights_output);
+        _concat_weights_output.run();
         _is_prepared = true;
     }
 }
diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
index c57fcc9..e30b1db 100644
--- a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,9 +28,7 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 
-#include <cmath>
 #include <memory>
-#include <tuple>
 
 namespace arm_compute
 {
@@ -275,6 +273,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights,
                                         recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in,
                                         output_state_in, cell_state_out, output_state_out);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::QASYMM8);
 
     const int input_size  = input->dimension(0);
     const int batch_size  = input->dimension(1);
diff --git a/src/runtime/CL/functions/CLLaplacianPyramid.cpp b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
index 831f0cd..81e903c 100644
--- a/src/runtime/CL/functions/CLLaplacianPyramid.cpp
+++ b/src/runtime/CL/functions/CLLaplacianPyramid.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
index ea6a3f9..cbb952c 100644
--- a/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
+++ b/src/runtime/CL/functions/CLLaplacianReconstruct.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -77,7 +77,7 @@
     // Scale levels n-1 to 1, and add levels n-2 to 0
     for(size_t l = 0; l < last_level; ++l)
     {
-        _scalef[l].configure(compile_context, _tmp_pyr.get_pyramid_level(l + 1), _tmp_pyr.get_pyramid_level(l), arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, constant_border_value);
+        _scalef[l].configure(compile_context, _tmp_pyr.get_pyramid_level(l + 1), _tmp_pyr.get_pyramid_level(l), ScaleKernelInfo{ arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, constant_border_value });
         _addf[l].configure(compile_context, _tmp_pyr.get_pyramid_level(l), pyramid->get_pyramid_level(l), _tmp_pyr.get_pyramid_level(l), ConvertPolicy::SATURATE);
     }
 
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
index 950be50..d501985 100644
--- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLMagnitude.cpp b/src/runtime/CL/functions/CLMagnitude.cpp
index a267952..962adad 100644
--- a/src/runtime/CL/functions/CLMagnitude.cpp
+++ b/src/runtime/CL/functions/CLMagnitude.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp
new file mode 100644
index 0000000..3e32c55
--- /dev/null
+++ b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLMaxUnpoolingLayerKernel.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+CLMaxUnpoolingLayer::CLMaxUnpoolingLayer()
+    : _memset_kernel(), _unpooling_layer_kernel()
+{
+}
+
+void CLMaxUnpoolingLayer::configure(ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, pool_info);
+}
+
+void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info)
+{
+    const PixelValue zero_value(0.f);
+    _memset_kernel.configure(output, zero_value);
+
+    _unpooling_layer_kernel.configure(compile_context, input, indices, output, pool_info);
+}
+
+Status CLMaxUnpoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+{
+    return CLMaxUnpoolingLayerKernel::validate(input, indices, output, pool_info);
+}
+
+void CLMaxUnpoolingLayer::run()
+{
+    // Run memset
+    CLScheduler::get().enqueue(_memset_kernel, false);
+
+    // Run max unpooling layer
+    CLScheduler::get().enqueue(_unpooling_layer_kernel);
+}
+} /* namespace arm_compute */
diff --git a/src/runtime/CL/functions/CLMeanStdDev.cpp b/src/runtime/CL/functions/CLMeanStdDev.cpp
index e3ce704..2517fdc 100644
--- a/src/runtime/CL/functions/CLMeanStdDev.cpp
+++ b/src/runtime/CL/functions/CLMeanStdDev.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
index 3dbab76..07ab669 100644
--- a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLMedian3x3.cpp b/src/runtime/CL/functions/CLMedian3x3.cpp
index dc53240..9215312 100644
--- a/src/runtime/CL/functions/CLMedian3x3.cpp
+++ b/src/runtime/CL/functions/CLMedian3x3.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLMinMaxLocation.cpp b/src/runtime/CL/functions/CLMinMaxLocation.cpp
index 15b2833..a27defe 100644
--- a/src/runtime/CL/functions/CLMinMaxLocation.cpp
+++ b/src/runtime/CL/functions/CLMinMaxLocation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLNonLinearFilter.cpp b/src/runtime/CL/functions/CLNonLinearFilter.cpp
index 96912a2..71f08e8 100644
--- a/src/runtime/CL/functions/CLNonLinearFilter.cpp
+++ b/src/runtime/CL/functions/CLNonLinearFilter.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
index 6d4a28d..a79bb0c 100644
--- a/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
+++ b/src/runtime/CL/functions/CLNonMaximaSuppression3x3.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp
index f59a4ca..4be6257 100644
--- a/src/runtime/CL/functions/CLNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp
index b03de64..806e648 100644
--- a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp
+++ b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLOpticalFlow.cpp b/src/runtime/CL/functions/CLOpticalFlow.cpp
index 5f7c170..0b5547e 100644
--- a/src/runtime/CL/functions/CLOpticalFlow.cpp
+++ b/src/runtime/CL/functions/CLOpticalFlow.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLPReluLayer.cpp b/src/runtime/CL/functions/CLPReluLayer.cpp
index 6543ab9..e03bd13 100644
--- a/src/runtime/CL/functions/CLPReluLayer.cpp
+++ b/src/runtime/CL/functions/CLPReluLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CL/functions/CLPReluLayer.h"
 #include "support/MemorySupport.h"
 
@@ -31,26 +32,45 @@
 {
 namespace
 {
-void configure_border_handler(const CLCompileContext &compile_context, CLFillBorderKernel &border_handler, BorderSize border_size, ICLTensor *input1, ICLTensor *input2, const ICLTensor *output)
+void configure_border_handler(const CLCompileContext &compile_context, CLFillBorderKernel &border_handler, BorderSize border_size, ITensorInfo *input1, ITensorInfo *input2, const ITensorInfo *output)
 {
-    if(output->info()->dimension(0) > 1)
+    if(output->dimension(0) > 1)
     {
-        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+        ITensorInfo *broadcasted_info = (input1->dimension(0) == 1) ? input1 : input2;
 
-        if(broadcasted_info->info()->dimension(0) == 1)
+        if(broadcasted_info->dimension(0) == 1)
         {
             border_handler.configure(compile_context, broadcasted_info, border_size, BorderMode::REPLICATE);
         }
     }
 }
+
+ITensorPack select_border_input(ITensorPack &tensors)
+{
+    ITensorPack pack;
+    if(tensors.get_tensor(TensorType::ACL_DST)->info()->dimension(0) > 1)
+    {
+        if(tensors.get_const_tensor(TensorType::ACL_SRC_1)->info()->dimension(0) == 1)
+        {
+            pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(TensorType::ACL_SRC_1));
+        }
+        else
+        {
+            pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(TensorType::ACL_SRC_0));
+        }
+    }
+    return pack;
+}
 } // namespace
 
-void CLPReluLayer::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
+namespace experimental
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, alpha, output);
+CLPReluLayer::CLPReluLayer()
+    : _border_handler()
+{
 }
 
-void CLPReluLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
+void CLPReluLayer::configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
     k->configure(compile_context, ArithmeticOperation::PRELU, input, alpha, output);
@@ -62,4 +82,57 @@
 {
     return CLArithmeticOperationKernel::validate(ArithmeticOperation::PRELU, input, alpha, output);
 }
+
+void CLPReluLayer::run(ITensorPack &tensors)
+{
+    auto border_pack = select_border_input(tensors);
+    CLScheduler::get().enqueue_op(_border_handler, border_pack);
+    ICLOperator::run(tensors);
+}
+} // namespace experimental
+
+struct CLPReluLayer::Impl
+{
+    const ICLTensor                            *src_0{ nullptr };
+    const ICLTensor                            *src_1{ nullptr };
+    ICLTensor                                  *dst{ nullptr };
+    std::unique_ptr<experimental::CLPReluLayer> op{ nullptr };
+};
+
+CLPReluLayer::CLPReluLayer()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+CLPReluLayer::CLPReluLayer(CLPReluLayer &&) = default;
+CLPReluLayer &CLPReluLayer::operator=(CLPReluLayer &&) = default;
+CLPReluLayer::~CLPReluLayer()                          = default;
+
+void CLPReluLayer::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, alpha, output);
+}
+
+void CLPReluLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
+{
+    _impl->src_0 = input;
+    _impl->src_1 = alpha;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::CLPReluLayer>();
+    _impl->op->configure(compile_context, input->info(), alpha->info(), output->info());
+}
+
+Status CLPReluLayer::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
+{
+    return experimental::CLPReluLayer::validate(input, alpha, output);
+}
+
+void CLPReluLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+
+    _impl->op->run(pack);
+}
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp
index 078bdbc..12a51f1 100644
--- a/src/runtime/CL/functions/CLPadLayer.cpp
+++ b/src/runtime/CL/functions/CLPadLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLPermute.cpp b/src/runtime/CL/functions/CLPermute.cpp
index e6323ce..e13046b 100644
--- a/src/runtime/CL/functions/CLPermute.cpp
+++ b/src/runtime/CL/functions/CLPermute.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLPhase.cpp b/src/runtime/CL/functions/CLPhase.cpp
index b915104..64d2e0f 100644
--- a/src/runtime/CL/functions/CLPhase.cpp
+++ b/src/runtime/CL/functions/CLPhase.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
index 3c1a7de..883ce68 100644
--- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
+++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,30 +25,52 @@
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
 
 namespace arm_compute
 {
-void CLPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
-                                          ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+namespace
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
+ITensorPack select_border_input(ITensorPack &tensors)
+{
+    ITensorPack pack;
+    if(tensors.get_tensor(TensorType::ACL_DST)->info()->dimension(0) > 1)
+    {
+        if(tensors.get_const_tensor(TensorType::ACL_SRC_1)->info()->dimension(0) == 1)
+        {
+            pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(TensorType::ACL_SRC_1));
+        }
+        else
+        {
+            pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(TensorType::ACL_SRC_0));
+        }
+    }
+    return pack;
+}
+} // namespace
+
+namespace experimental
+{
+CLPixelWiseMultiplication::CLPixelWiseMultiplication()
+    : _border_handler()
+{
 }
 
-void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
+void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale,
                                           ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLPixelWiseMultiplicationKernel>();
     k->configure(compile_context, input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
     _kernel = std::move(k);
 
-    if(output->info()->dimension(0) > 1)
+    if(output->dimension(0) > 1)
     {
-        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+        ITensorInfo *broadcasted_info = (input1->dimension(0) == 1) ? input1 : input2;
 
-        if(broadcasted_info->info()->dimension(0) == 1)
+        if(broadcasted_info->dimension(0) == 1)
         {
             _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
         }
@@ -61,22 +83,29 @@
     return CLPixelWiseMultiplicationKernel::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
 }
 
-void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLPixelWiseMultiplication::run(ITensorPack &tensors)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
+    auto border_pack = select_border_input(tensors);
+    CLScheduler::get().enqueue_op(_border_handler, border_pack);
+    ICLOperator::run(tensors);
 }
 
-void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication()
+    : _border_handler()
+{
+}
+
+void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLComplexPixelWiseMultiplicationKernel>();
     k->configure(compile_context, input1, input2, output, act_info);
     _kernel = std::move(k);
 
-    if(output->info()->dimension(0) > 1)
+    if(output->dimension(0) > 1)
     {
-        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+        ITensorInfo *broadcasted_info = (input1->dimension(0) == 1) ? input1 : input2;
 
-        if(broadcasted_info->info()->dimension(0) == 1)
+        if(broadcasted_info->dimension(0) == 1)
         {
             _border_handler.configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
         }
@@ -87,4 +116,105 @@
 {
     return CLComplexPixelWiseMultiplicationKernel::validate(input1, input2, output, act_info);
 }
+
+void CLComplexPixelWiseMultiplication::run(ITensorPack &tensors)
+{
+    auto border_pack = select_border_input(tensors);
+    CLScheduler::get().enqueue_op(_border_handler, border_pack);
+    ICLOperator::run(tensors);
+}
+} // namespace experimental
+
+struct CLPixelWiseMultiplication::Impl
+{
+    const ICLTensor                                         *src_0{ nullptr };
+    const ICLTensor                                         *src_1{ nullptr };
+    ICLTensor                                               *dst{ nullptr };
+    std::unique_ptr<experimental::CLPixelWiseMultiplication> op{ nullptr };
+};
+
+CLPixelWiseMultiplication::CLPixelWiseMultiplication()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+CLPixelWiseMultiplication::CLPixelWiseMultiplication(CLPixelWiseMultiplication &&) = default;
+CLPixelWiseMultiplication &CLPixelWiseMultiplication::operator=(CLPixelWiseMultiplication &&) = default;
+CLPixelWiseMultiplication::~CLPixelWiseMultiplication()                                       = default;
+
+void CLPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
+                                          ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
+}
+
+void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
+                                          ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::CLPixelWiseMultiplication>();
+    _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy, act_info);
+}
+
+Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
+                                           ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+{
+    return experimental::CLPixelWiseMultiplication::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
+}
+
+void CLPixelWiseMultiplication::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+
+    _impl->op->run(pack);
+}
+
+struct CLComplexPixelWiseMultiplication::Impl
+{
+    const ICLTensor                                                *src_0{ nullptr };
+    const ICLTensor                                                *src_1{ nullptr };
+    ICLTensor                                                      *dst{ nullptr };
+    std::unique_ptr<experimental::CLComplexPixelWiseMultiplication> op{ nullptr };
+};
+
+CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication(CLComplexPixelWiseMultiplication &&) = default;
+CLComplexPixelWiseMultiplication &CLComplexPixelWiseMultiplication::operator=(CLComplexPixelWiseMultiplication &&) = default;
+CLComplexPixelWiseMultiplication::~CLComplexPixelWiseMultiplication()                                              = default;
+
+void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
+}
+
+void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::CLComplexPixelWiseMultiplication>();
+    _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
+}
+
+Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+    return experimental::CLComplexPixelWiseMultiplication::validate(input1, input2, output, act_info);
+}
+
+void CLComplexPixelWiseMultiplication::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+
+    _impl->op->run(pack);
+}
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
index e7735b0..a14818f 100644
--- a/src/runtime/CL/functions/CLPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLPriorBoxLayer.cpp b/src/runtime/CL/functions/CLPriorBoxLayer.cpp
index d01b4c7..1907c7c 100644
--- a/src/runtime/CL/functions/CLPriorBoxLayer.cpp
+++ b/src/runtime/CL/functions/CLPriorBoxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLQLSTMLayer.cpp b/src/runtime/CL/functions/CLQLSTMLayer.cpp
index 8b21424..a40a5d0 100644
--- a/src/runtime/CL/functions/CLQLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLQLSTMLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -113,7 +113,7 @@
                              const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
                              const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
                              const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
-                             const ICLTensor *cell_state_in, const ICLTensor *output_state_in,
+                             ICLTensor *cell_state_in, const ICLTensor *output_state_in,
                              ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output,
                              const LSTMParams<ICLTensor> &lstm_params)
 {
@@ -126,7 +126,7 @@
                              const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
                              const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
                              const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
-                             const ICLTensor *cell_state_in, const ICLTensor *output_state_in,
+                             ICLTensor *cell_state_in, const ICLTensor *output_state_in,
                              ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output,
                              const LSTMParams<ICLTensor> &lstm_params)
 {
@@ -211,6 +211,10 @@
     if(_has_projection)
     {
         _projection_reduction.configure(compile_context, _projection_weights, &_projection_eff_bias, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
+        if(_projection_bias != nullptr)
+        {
+            _projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE);
+        }
     }
 
     // Pre-transpose weights to be used in GEMM.
@@ -251,7 +255,7 @@
                  &_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale,
                  mm_out_info, forget_gate_outstage_info);
 
-    _accumulate_input_recurrent_forget.configure(compile_context, ArithmeticOperation::ADD, &_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
+    _accumulate_input_recurrent_forget.configure(compile_context, &_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
                                                  ConvertPolicy::SATURATE);
     _input_to_forget_outstage_res.allocator()->allocate();
 
@@ -266,7 +270,7 @@
         quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
         _cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info);
         _mul_cell_to_forget_res.allocator()->allocate();
-        _accumulate_cell_forget.configure(compile_context, ArithmeticOperation::ADD, &_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
+        _accumulate_cell_forget.configure(compile_context, &_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
                                           ConvertPolicy::SATURATE);
         _cell_to_forget_outstage_res.allocator()->allocate();
     }
@@ -303,7 +307,7 @@
                  &_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale,
                  mm_out_info, cell_outstage_info);
 
-    _accumulate_input_recurrent_modulation.configure(compile_context, ArithmeticOperation::ADD, &_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res,
+    _accumulate_input_recurrent_modulation.configure(compile_context, &_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res,
                                                      ConvertPolicy::SATURATE);
     _input_to_cell_outstage_res.allocator()->allocate();
 
@@ -329,7 +333,7 @@
     if(_has_cifg)
     {
         _ones.allocator()->init(*_forget_gate.info());
-        _input_gate_sub.configure(compile_context, ArithmeticOperation::SUB, &_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE);
+        _input_gate_sub.configure(compile_context, &_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE);
         _ones.allocator()->allocate();
     }
     else
@@ -346,7 +350,7 @@
                      output_state_in, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias,
                      &_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale,
                      mm_out_info, input_outstage_info);
-        _accumulate_input_recurrent_input.configure(compile_context, ArithmeticOperation::ADD, &_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res,
+        _accumulate_input_recurrent_input.configure(compile_context, &_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res,
                                                     ConvertPolicy::SATURATE);
         _input_to_input_outstage_res.allocator()->allocate();
 
@@ -361,7 +365,7 @@
             _memory_group.manage(&_cell_to_input_outstage_res);
             _cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info);
             _mul_cell_to_input_res.allocator()->allocate();
-            _accumulate_cell_input.configure(ArithmeticOperation::ADD, &_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
+            _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
             _cell_to_input_outstage_res.allocator()->allocate();
         }
 
@@ -378,7 +382,7 @@
         input_activation_input->allocator()->allocate();
     }
     // Cell.
-    // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplicationKernel
+    // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication
     _pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
     const float      cell_gate_scale      = _cell_gate.info()->quantization_info().uniform().scale;
     const float      mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift);
@@ -387,7 +391,7 @@
     _mul_input_cell_res.allocator()->init(mul_input_cell_info);
     _pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
     _cell_gate.allocator()->allocate();
-    _add_forget_cell.configure(compile_context, ArithmeticOperation::ADD, &_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE);
+    _add_forget_cell.configure(compile_context, &_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE);
     _mul_input_cell_res.allocator()->allocate();
     _forget_gate.allocator()->allocate();
     if(_has_cell_clipping)
@@ -408,13 +412,13 @@
                  &_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale,
                  mm_out_info, output_outstage_info);
 
-    _accumulate_input_recurrent_output.configure(compile_context, ArithmeticOperation::ADD, &_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res,
+    _accumulate_input_recurrent_output.configure(compile_context, &_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res,
                                                  ConvertPolicy::SATURATE);
     _input_to_output_outstage_res.allocator()->allocate();
 
     if(_has_peephole)
     {
-        // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplicationKernel
+        // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication
         // Here we are not using the output stage because all operations are done in float
         _mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32));
         _memory_group.manage(&_mul_cell_to_output_res);
@@ -427,7 +431,7 @@
         _cell_to_output_outstage.configure(compile_context, &_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, gemmlowp_info);
         _mul_cell_to_output_res.allocator()->allocate();
 
-        _accumulate_cell_to_output.configure(compile_context, ArithmeticOperation::ADD, &_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res,
+        _accumulate_cell_to_output.configure(compile_context, &_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res,
                                              ConvertPolicy::SATURATE);
         _cell_to_output_outstage_res.allocator()->allocate();
     }
@@ -449,7 +453,7 @@
 
     // Hidden.
     _hidden_tanh.configure(compile_context, cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
-    // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplicationKernel
+    // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication
     _memory_group.manage(&_hidden_mul_res);
     const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32);
     _hidden_mul_res.allocator()->init(hidden_mul_res);
@@ -506,7 +510,7 @@
             accumulate_destination = &_projection_accumulate_res;
         }
 
-        _accumulate_projection.configure(compile_context, ArithmeticOperation::ADD, &_projection_outstage_res, accumulate_destination, accumulate_destination, ConvertPolicy::SATURATE);
+        _accumulate_projection.configure(compile_context, &_projection_outstage_res, accumulate_destination, accumulate_destination, ConvertPolicy::SATURATE);
         _projection_outstage_res.allocator()->allocate();
 
         if(_projection_tensor_copy_required)
@@ -640,6 +644,12 @@
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false,
                                                                                lstm_params.hidden_state_zero(),
                                                                                true)));
+        if(lstm_params.projection_bias() != nullptr)
+        {
+            ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.projection_bias(), 1, DataType::S32);
+            ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info,
+                                                                       &projection_eff_bias_info, ConvertPolicy::SATURATE));
+        }
     }
 
     const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_forget_weights->data_type(), input_to_forget_weights->quantization_info());
@@ -672,6 +682,7 @@
     const bool has_layer_norm = lstm_params.use_layer_norm();
 
     // Forget gate.
+    ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_intermediate_scale() == 0);
     const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
     const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);
     const float      input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();
@@ -680,17 +691,17 @@
     const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
     ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info));
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
 
     if(lstm_params.has_peephole_opt())
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
-                                                                              RoundingPolicy::TO_ZERO));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
+                                                                        RoundingPolicy::TO_ZERO));
         const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
         ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
     }
 
     if(has_layer_norm)
@@ -707,6 +718,7 @@
     ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_outstage_info, &forget_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
 
     // Modulation gate.
+    ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_intermediate_scale() == 0);
     const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
     const float      input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
     ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info));
@@ -714,7 +726,7 @@
     const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
     ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &input_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info));
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE));
 
     if(has_layer_norm)
     {
@@ -731,7 +743,7 @@
     if(lstm_params.has_cifg_opt())
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used");
-        ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::SUB, &input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE));
     }
     else
     {
@@ -742,6 +754,7 @@
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.input_gate_bias());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias());
 
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_intermediate_scale() == 0);
         const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
         const float      input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
         ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info));
@@ -749,16 +762,16 @@
         const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
         ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info));
 
-        ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
 
         if(lstm_params.has_peephole_opt())
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
-                                                                                  RoundingPolicy::TO_ZERO));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
+                                                                            RoundingPolicy::TO_ZERO));
             const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
             ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
             ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
-            ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
         }
 
         if(has_layer_norm)
@@ -771,15 +784,16 @@
         ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_outstage_info, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 1.f, 1.f)));
     }
     // Cell.
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
     if(quantized_cell_clip > 0)
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip,
                                                                                                              quantized_cell_clip)));
     }
     // Output gate.
+    ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_intermediate_scale() == 0);
     const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
     const float      input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
     ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info));
@@ -787,7 +801,7 @@
     const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
     ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info));
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
     if(lstm_params.has_peephole_opt())
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16);
@@ -795,9 +809,9 @@
         // Here we are not using the output stage because all operations are done in float
         // const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
         // ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
-                                                                              RoundingPolicy::TO_ZERO));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
+                                                                        RoundingPolicy::TO_ZERO));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
     }
 
     if(has_layer_norm)
@@ -815,7 +829,8 @@
     const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32);
     const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED);
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.hidden_state_scale() == 0);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
     const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
     ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true));
     gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero();
@@ -827,7 +842,7 @@
     if(lstm_params.has_projection())
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights());
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.projection_bias());
+        ARM_COMPUTE_RETURN_ERROR_ON(qoutput_state_in.scale == 0);
 
         const UniformQuantizationInfo qprojection      = lstm_params.projection_weights()->quantization_info().uniform();
         const float                   projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
@@ -851,7 +866,7 @@
             ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(*output_state_out, projection_outstage_info));
         }
 
-        ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE));
 
         if(projection_tensor_copy_required)
         {
@@ -907,13 +922,13 @@
 
     _mm_recurrent_to_forget.run();
     _recurrent_to_forget_outstage.run();
-    CLScheduler::get().enqueue(_accumulate_input_recurrent_forget);
+    _accumulate_input_recurrent_forget.run();
 
     if(_has_peephole)
     {
-        CLScheduler::get().enqueue(_pixelwise_mul_cell_to_forget);
+        _pixelwise_mul_cell_to_forget.run();
         _cell_to_forget_outstage.run();
-        CLScheduler::get().enqueue(_accumulate_cell_forget);
+        _accumulate_cell_forget.run();
     }
 
     if(_has_layer_norm)
@@ -929,7 +944,7 @@
 
     _mm_recurrent_to_cell.run();
     _recurrent_to_cell_outstage.run();
-    CLScheduler::get().enqueue(_accumulate_input_recurrent_modulation);
+    _accumulate_input_recurrent_modulation.run();
 
     if(_has_layer_norm)
     {
@@ -941,7 +956,7 @@
     // Input gate
     if(_has_cifg)
     {
-        CLScheduler::get().enqueue(_input_gate_sub);
+        _input_gate_sub.run();
     }
     else
     {
@@ -949,13 +964,13 @@
         _input_to_input_outstage.run();
         _mm_recurrent_to_input.run();
         _recurrent_to_input_outstage.run();
-        CLScheduler::get().enqueue(_accumulate_input_recurrent_input);
+        _accumulate_input_recurrent_input.run();
 
         if(_has_peephole)
         {
-            CLScheduler::get().enqueue(_pixelwise_mul_cell_to_input);
+            _pixelwise_mul_cell_to_input.run();
             _cell_to_input_outstage.run();
-            CLScheduler::get().enqueue(_accumulate_cell_input);
+            _accumulate_cell_input.run();
         }
 
         if(_has_layer_norm)
@@ -967,9 +982,9 @@
     }
 
     // Cell.
-    CLScheduler::get().enqueue(_pixelwise_mul_forget_cell);
-    CLScheduler::get().enqueue(_pixelwise_mul_input_cell);
-    CLScheduler::get().enqueue(_add_forget_cell);
+    _pixelwise_mul_forget_cell.run();
+    _pixelwise_mul_input_cell.run();
+    _add_forget_cell.run();
     if(_has_cell_clipping)
     {
         _cell_clip.run();
@@ -980,12 +995,12 @@
     _input_to_output_outstage.run();
     _mm_recurrent_to_output.run();
     _recurrent_to_output_outstage.run();
-    CLScheduler::get().enqueue(_accumulate_input_recurrent_output);
+    _accumulate_input_recurrent_output.run();
     if(_has_peephole)
     {
-        CLScheduler::get().enqueue(_pixelwise_mul_cell_to_output);
+        _pixelwise_mul_cell_to_output.run();
         _cell_to_output_outstage.run();
-        CLScheduler::get().enqueue(_accumulate_cell_to_output);
+        _accumulate_cell_to_output.run();
     }
 
     if(_has_layer_norm)
@@ -997,7 +1012,7 @@
 
     // Hidden.
     _hidden_tanh.run();
-    CLScheduler::get().enqueue(_pixelwise_mul_hidden);
+    _pixelwise_mul_hidden.run();
     _hidden_outstage.run();
 
     // Projection.
@@ -1011,7 +1026,7 @@
             _projection_output_to_accumulate_copy.run();
         }
 
-        CLScheduler::get().enqueue(_accumulate_projection);
+        _accumulate_projection.run();
 
         if(_projection_tensor_copy_required)
         {
@@ -1089,10 +1104,11 @@
 
         if(_has_projection)
         {
+            _projection_eff_bias.allocator()->allocate();
+            CLScheduler::get().enqueue(_projection_reduction);
             if(_projection_bias != nullptr)
             {
-                _projection_eff_bias.allocator()->allocate();
-                CLScheduler::get().enqueue(_projection_reduction);
+                _projection_bias_add.run();
                 _projection_bias->mark_as_unused();
             }
 
diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp
index 6239f27..f0a446a 100644
--- a/src/runtime/CL/functions/CLQuantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
index 57b8d70..94e7f94 100644
--- a/src/runtime/CL/functions/CLRNNLayer.cpp
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,13 +29,12 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
-#include <utility>
-
-using namespace arm_compute;
+namespace arm_compute
+{
 using namespace arm_compute::misc::shape_calculator;
 
 CLRNNLayer::CLRNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output(),
+    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output(),
       _is_prepared(false)
 {
 }
@@ -43,9 +42,13 @@
 Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state,
                             const ITensorInfo *output, const ActivationLayerInfo &info)
 {
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, recurrent_weights, bias, hidden_state, output);
+
     const int idx_width  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
     const int idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
+
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) != recurrent_weights->dimension(idx_width));
     ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) != recurrent_weights->dimension(1));
@@ -59,8 +62,8 @@
 
     ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info));
     ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&shape_info, &shape_info, info));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&shape_info, &shape_info, info));
 
     return Status{};
 }
@@ -96,12 +99,12 @@
     _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
     _memory_group.manage(&_add_output);
 
-    _add_kernel.configure(compile_context, ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE);
+    _add_kernel.configure(compile_context, &_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE);
 
     _fully_connected_out.allocator()->allocate();
     _gemm_output.allocator()->allocate();
 
-    _activation_kernel.configure(compile_context, &_add_output, hidden_state, info);
+    _activation.configure(compile_context, &_add_output, hidden_state, info);
     _add_output.allocator()->allocate();
 
     _copy_kernel.configure(compile_context, hidden_state, output);
@@ -115,8 +118,8 @@
 
     _fully_connected_kernel.run();
     _gemm_state_f.run();
-    CLScheduler::get().enqueue(_add_kernel);
-    CLScheduler::get().enqueue(_activation_kernel);
+    _add_kernel.run();
+    _activation.run();
 
     // copy hidden out to output
     CLScheduler::get().enqueue(_copy_kernel);
@@ -132,3 +135,4 @@
         _is_prepared = true;
     }
 }
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLROIAlignLayer.cpp b/src/runtime/CL/functions/CLROIAlignLayer.cpp
index 43b58dd..2337cee 100644
--- a/src/runtime/CL/functions/CLROIAlignLayer.cpp
+++ b/src/runtime/CL/functions/CLROIAlignLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
index bb54cfa..cdf60ce 100644
--- a/src/runtime/CL/functions/CLROIPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLRange.cpp b/src/runtime/CL/functions/CLRange.cpp
index b29b03d..8bf2a0c 100644
--- a/src/runtime/CL/functions/CLRange.cpp
+++ b/src/runtime/CL/functions/CLRange.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp
index ce44763..c8eb542 100644
--- a/src/runtime/CL/functions/CLReduceMean.cpp
+++ b/src/runtime/CL/functions/CLReduceMean.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index b659ecf..54e91fb 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,22 +24,19 @@
 #include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/Tensor.h"
 #include "arm_compute/runtime/Utils.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
 {
 CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _results_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _reshape_kernel(), _op(), _num_of_stages(), _reduction_axis(), _is_serial(),
+    : _memory_group(std::move(memory_manager)), _results_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _reshape(), _num_of_stages(), _reduction_axis(), _is_serial(),
       _is_reshape_required(false)
 {
 }
@@ -152,7 +149,7 @@
 
     if(is_reshape_required)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(output_internal, output));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(output_internal, output));
     }
 
     return Status{};
@@ -197,7 +194,6 @@
 void CLReductionOperation::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    _op                  = op;
     _num_of_stages       = calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis);
     _reduction_axis      = axis;
     _is_serial           = needs_serialized_reduction(op, input->info()->data_type(), axis);
@@ -259,65 +255,13 @@
                 first_kernel_op        = ReductionOperation::MIN;
                 intermediate_kernel_op = ReductionOperation::MIN;
                 last_kernel_op         = ReductionOperation::MIN;
-                switch(input->info()->data_type())
-                {
-                    case DataType::F32:
-                    {
-                        pixelValue = PixelValue(std::numeric_limits<float>::max());
-                        break;
-                    }
-                    case DataType::F16:
-                    {
-                        pixelValue = PixelValue(static_cast<half>(65504.0f));
-                        break;
-                    }
-                    case DataType::QASYMM8:
-                    {
-                        pixelValue = std::get<1>(get_min_max(input->info()->data_type()));
-                        break;
-                    }
-                    case DataType::QASYMM8_SIGNED:
-                    {
-                        pixelValue = PixelValue(127, input->info()->data_type(), input->info()->quantization_info());
-                        break;
-                    }
-                    default:
-                    {
-                        ARM_COMPUTE_ERROR("Unsupported DataType");
-                    }
-                }
+                pixelValue             = std::get<1>(get_min_max(input->info()->data_type()));
                 break;
             case ReductionOperation::MAX:
                 first_kernel_op        = ReductionOperation::MAX;
                 intermediate_kernel_op = ReductionOperation::MAX;
                 last_kernel_op         = ReductionOperation::MAX;
-                switch(input->info()->data_type())
-                {
-                    case DataType::F32:
-                    {
-                        pixelValue = PixelValue(-std::numeric_limits<float>::max());
-                        break;
-                    }
-                    case DataType::F16:
-                    {
-                        pixelValue = PixelValue(static_cast<half>(-65504.0f));
-                        break;
-                    }
-                    case DataType::QASYMM8:
-                    {
-                        pixelValue = std::get<0>(get_min_max(input->info()->data_type()));
-                        break;
-                    }
-                    case DataType::QASYMM8_SIGNED:
-                    {
-                        pixelValue = PixelValue(-128, input->info()->data_type(), input->info()->quantization_info());
-                        break;
-                    }
-                    default:
-                    {
-                        ARM_COMPUTE_ERROR("Unsupported DataType");
-                    }
-                }
+                pixelValue             = std::get<0>(get_min_max(input->info()->data_type()));
                 break;
             default:
                 ARM_COMPUTE_ERROR("Not supported");
@@ -351,7 +295,7 @@
 
     if(_is_reshape_required)
     {
-        _reshape_kernel.configure(compile_context, &_results_vector.back(), output);
+        _reshape.configure(compile_context, &_results_vector.back(), output);
         _results_vector.back().allocator()->allocate();
     }
 }
@@ -375,7 +319,7 @@
 
     if(_is_reshape_required)
     {
-        CLScheduler::get().enqueue(_reshape_kernel, false);
+        _reshape.run();
     }
 }
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLRemap.cpp b/src/runtime/CL/functions/CLRemap.cpp
index af241ec..60b72c5 100644
--- a/src/runtime/CL/functions/CLRemap.cpp
+++ b/src/runtime/CL/functions/CLRemap.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLReorgLayer.cpp b/src/runtime/CL/functions/CLReorgLayer.cpp
index ea93314..1dc41ae 100644
--- a/src/runtime/CL/functions/CLReorgLayer.cpp
+++ b/src/runtime/CL/functions/CLReorgLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLReshapeLayer.cpp b/src/runtime/CL/functions/CLReshapeLayer.cpp
index 13baedb..273a761 100644
--- a/src/runtime/CL/functions/CLReshapeLayer.cpp
+++ b/src/runtime/CL/functions/CLReshapeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,7 +28,38 @@
 #include "support/MemorySupport.h"
 
 /** [CLReshapeLayer snippet] **/
-using namespace arm_compute;
+namespace arm_compute
+{
+namespace experimental
+{
+void CLReshape::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CLReshapeLayerKernel>();
+    k->configure(compile_context, input, output);
+    _kernel = std::move(k);
+}
+
+Status CLReshape::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return arm_compute::CLReshapeLayerKernel::validate(input, output);
+}
+} // namespace experimental
+
+struct CLReshapeLayer::Impl
+{
+    const ICLTensor                         *src{ nullptr };
+    ICLTensor                               *dst{ nullptr };
+    std::unique_ptr<experimental::CLReshape> op{ nullptr };
+};
+
+CLReshapeLayer::CLReshapeLayer()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+
+CLReshapeLayer::CLReshapeLayer(CLReshapeLayer &&) = default;
+CLReshapeLayer &CLReshapeLayer::operator=(CLReshapeLayer &&) = default;
+CLReshapeLayer::~CLReshapeLayer()                            = default;
 
 void CLReshapeLayer::configure(const ICLTensor *input, ICLTensor *output)
 {
@@ -37,13 +68,26 @@
 
 void CLReshapeLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CLReshapeLayerKernel>();
-    k->configure(compile_context, input, output);
-    _kernel = std::move(k);
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = arm_compute::support::cpp14::make_unique<experimental::CLReshape>();
+    _impl->op->configure(compile_context, input->info(), output->info());
 }
 
 Status CLReshapeLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    return CLReshapeLayerKernel::validate(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(experimental::CLReshape::validate(input, output));
+
+    return Status{};
 }
+
+void CLReshapeLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+} // namespace arm_compute
 /** [CLReshapeLayer snippet] **/
diff --git a/src/runtime/CL/functions/CLReverse.cpp b/src/runtime/CL/functions/CLReverse.cpp
index 3c8bc15..213fbc8 100644
--- a/src/runtime/CL/functions/CLReverse.cpp
+++ b/src/runtime/CL/functions/CLReverse.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp
index a9395bd..e111c6d 100644
--- a/src/runtime/CL/functions/CLScale.cpp
+++ b/src/runtime/CL/functions/CLScale.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,38 +30,53 @@
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "support/MemorySupport.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+void CLScale::configure(ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
+}
 
 void CLScale::configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding,
                         bool align_corners)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, policy, border_mode, constant_border_value, sampling_policy, use_padding, align_corners);
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, ScaleKernelInfo{ policy, border_mode, constant_border_value, sampling_policy, use_padding, align_corners });
 }
 
-void CLScale::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value,
-                        SamplingPolicy sampling_policy, bool use_padding, bool align_corners)
+void CLScale::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info)
 {
-    ARM_COMPUTE_UNUSED(use_padding);
     auto k = arm_compute::support::cpp14::make_unique<CLScaleKernel>();
     k->set_target(CLScheduler::get().target());
-    k->configure(compile_context, input, output, policy, border_mode, sampling_policy, align_corners);
+    k->configure(compile_context, input, output, info);
     _kernel = std::move(k);
 
     // Tune kernels
     CLScheduler::get().tune_kernel_static(*_kernel);
 
+    auto border_mode_to_use = info.border_mode;
     // In the case of NHWC we can't have undefined border mode as this would require to access elements outside z dimension,
     // so we treat it like border constant.
-    if(border_mode == BorderMode::UNDEFINED && input->info()->data_layout() == DataLayout::NHWC)
+    if(info.border_mode == BorderMode::UNDEFINED && input->info()->data_layout() == DataLayout::NHWC)
     {
-        border_mode = BorderMode::CONSTANT;
+        border_mode_to_use = BorderMode::CONSTANT;
     }
-    _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode, constant_border_value);
+    _border_handler.configure(compile_context, input, _kernel->border_size(), border_mode_to_use, info.constant_border_value);
+}
+
+void CLScale::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value,
+                        SamplingPolicy sampling_policy, bool use_padding, bool align_corners)
+{
+    configure(compile_context, input, output, ScaleKernelInfo{ policy, border_mode, constant_border_value, sampling_policy, use_padding, align_corners });
 }
 
 Status CLScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy,
                          bool use_padding, bool align_corners)
 {
-    ARM_COMPUTE_UNUSED(constant_border_value, use_padding);
-    return CLScaleKernel::validate(input, output, policy, border_mode, sampling_policy, align_corners);
+    return CLScale::validate(input, output, ScaleKernelInfo{ policy, border_mode, constant_border_value, sampling_policy, use_padding, align_corners });
 }
+
+Status CLScale::validate(const ITensorInfo *input, const ITensorInfo *output, const ScaleKernelInfo &info)
+{
+    return CLScaleKernel::validate(input, output, info);
+}
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLScharr3x3.cpp b/src/runtime/CL/functions/CLScharr3x3.cpp
index faad542..b121ee7 100644
--- a/src/runtime/CL/functions/CLScharr3x3.cpp
+++ b/src/runtime/CL/functions/CLScharr3x3.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLSelect.cpp b/src/runtime/CL/functions/CLSelect.cpp
index 7187010..c7d7df7 100644
--- a/src/runtime/CL/functions/CLSelect.cpp
+++ b/src/runtime/CL/functions/CLSelect.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLSlice.cpp b/src/runtime/CL/functions/CLSlice.cpp
index e8cc0f5..f36550b 100644
--- a/src/runtime/CL/functions/CLSlice.cpp
+++ b/src/runtime/CL/functions/CLSlice.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,12 +31,9 @@
 
 namespace arm_compute
 {
-void CLSlice::configure(const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends)
+namespace experimental
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends);
-}
-
-void CLSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends)
+void CLSlice::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
 
@@ -63,4 +60,46 @@
 
     return CLStridedSliceKernel::validate(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0);
 }
+} // namespace experimental
+
+struct CLSlice::Impl
+{
+    const ICLTensor                       *src{ nullptr };
+    ICLTensor                             *dst{ nullptr };
+    std::unique_ptr<experimental::CLSlice> op{ nullptr };
+};
+
+CLSlice::CLSlice()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+CLSlice::CLSlice(CLSlice &&) = default;
+CLSlice &CLSlice::operator=(CLSlice &&) = default;
+CLSlice::~CLSlice()                     = default;
+
+Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+{
+    return experimental::CLSlice::validate(input, output, starts, ends);
+}
+
+void CLSlice::configure(const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends);
+}
+
+void CLSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends)
+{
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = arm_compute::support::cpp14::make_unique<experimental::CLSlice>();
+    _impl->op->configure(compile_context, input->info(), output->info(), starts, ends);
+}
+
+void CLSlice::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLSobel3x3.cpp b/src/runtime/CL/functions/CLSobel3x3.cpp
index c3604f9..566a4a1 100644
--- a/src/runtime/CL/functions/CLSobel3x3.cpp
+++ b/src/runtime/CL/functions/CLSobel3x3.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLSobel5x5.cpp b/src/runtime/CL/functions/CLSobel5x5.cpp
index f8a33f3..f70e4f3 100644
--- a/src/runtime/CL/functions/CLSobel5x5.cpp
+++ b/src/runtime/CL/functions/CLSobel5x5.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLSobel7x7.cpp b/src/runtime/CL/functions/CLSobel7x7.cpp
index 6d3c7f0..792432e 100644
--- a/src/runtime/CL/functions/CLSobel7x7.cpp
+++ b/src/runtime/CL/functions/CLSobel7x7.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index b0b2117..f7b2935 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,40 +36,43 @@
 {
 template <bool IS_LOG>
 CLSoftmaxLayerGeneric<IS_LOG>::CLSoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _max_shift_exp_sum_kernel(), _norm_kernel(), _flatten_kernel_ptr(), _reshape_kernel(), _max(), _sum(), _tmp(), _input_flattened(), _output_flattened(),
+    : _memory_group(std::move(memory_manager)), _max_shift_exp_sum_kernel(), _norm_kernel(), _flatten_ptr(), _reshape(), _max(), _sum(), _tmp(), _input_flattened(), _output_flattened(),
       _needs_flattening(false)
 {
 }
 
 template <bool IS_LOG>
-void CLSoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const ICLTensor *input, const ICLTensor *output, size_t axis)
+void CLSoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const ICLTensor *input, const ICLTensor *output, size_t first_n_reduce_axes)
 {
-    configure_reshape_input_kernel(CLKernelLibrary::get().get_compile_context(), input, output, axis);
+    configure_reshape_input_kernel(CLKernelLibrary::get().get_compile_context(), input, output, first_n_reduce_axes);
 }
 
 template <bool IS_LOG>
-void CLSoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *output, size_t axis)
+void CLSoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *output, size_t first_n_reduce_axes)
 {
     // Flatten the input
-    const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input->info(), axis);
+    const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input->info(), first_n_reduce_axes);
 
     // Initialize the flat input
     _input_flattened.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten));
 
     // If we need to flatten the input, we can use CLFlattenKernel or CLReshapeKernel
-    // If flattening on the third axes, we use CLFlattenKernel.
+    // If the number of reduced axes is 3 (max dimension), which means collapsing all axes except the batch axis, we use CLFlattenKernel.
     // In all other cases we have to use CLReshapeKernel
-    if(axis != 3)
+    // Note that the "other cases" include both:
+    //   1. first_n_reduce_axes < 3: Reduce the first 1 (no need to reduce) or 2 dimensions (inclusive)
+    //   2. first_n_reduce_axes == 4: Reduce all 4 dimensions. This can only be handled by CLReshapeKernel instead of CLFlattenKernel.
+    if(first_n_reduce_axes == 3)
     {
-        auto reshape_kernel_ptr = support::cpp14::make_unique<CLReshapeLayerKernel>();
-        reshape_kernel_ptr->configure(compile_context, input, &_input_flattened);
-        _flatten_kernel_ptr = std::move(reshape_kernel_ptr);
+        auto flatten = support::cpp14::make_unique<CLFlattenLayer>();
+        flatten->configure(compile_context, input, &_input_flattened);
+        _flatten_ptr = std::move(flatten);
     }
     else
     {
-        auto flatten_kernel_ptr = support::cpp14::make_unique<CLFlattenLayerKernel>();
-        flatten_kernel_ptr->configure(compile_context, input, &_input_flattened);
-        _flatten_kernel_ptr = std::move(flatten_kernel_ptr);
+        auto reshape_ptr = support::cpp14::make_unique<CLReshapeLayer>();
+        reshape_ptr->configure(compile_context, input, &_input_flattened);
+        _flatten_ptr = std::move(reshape_ptr);
     }
 
     // We need to init the output tensor here. Indeed, the reshape kernel expects
@@ -90,8 +93,11 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(CLSoftmaxLayerGeneric<IS_LOG>::validate(input->info(), output->info(), beta, axis));
 
-    // We don't need flattening only in the case the input is 2D and axis is 1
-    _needs_flattening = axis != 1;
+    // Convert reduce-before axis (inclusive) to first n axes to reduce
+    size_t first_n_reduce_axes = dim_index_2_num_dims(axis, input->info()->num_dimensions());
+
+    // We only need flattening when the number of axes to reduce is greater than 1
+    _needs_flattening = first_n_reduce_axes > 1;
 
     // If we are dealing with a 4D tensor, we will:
     // - Flatten the input, so that we end up with a [width*height*depth] * batches 2D tensor
@@ -102,8 +108,8 @@
         // Add to the memory manager _input_flattened
         _memory_group.manage(&_input_flattened);
 
-        // Cofigure  _flatten_kernel and _input_flattened
-        configure_reshape_input_kernel(input, output, axis);
+        // Configure _flatten_kernel and _input_flattened
+        configure_reshape_input_kernel(input, output, first_n_reduce_axes);
     }
 
     // We want to deal with a 2D input. Either it is the flattened version of the original input (4D case)
@@ -146,7 +152,7 @@
         _norm_kernel.configure(compile_context, &_tmp, &_sum, &_output_flattened, softmax_info);
 
         // Reshape the flat output into a the requested (4D) output
-        _reshape_kernel.configure(compile_context, &_output_flattened, output);
+        _reshape.configure(compile_context, &_output_flattened, output);
 
         // Allocate the intermediate flat tensors
         _input_flattened.allocator()->allocate();
@@ -169,7 +175,12 @@
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Only up to 4 dimensions are supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != 0, "Only axis 0 supported in tensors");
     ARM_COMPUTE_UNUSED(beta);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() <= axis);
+
+    // Convert reduce-before axis (inclusive) to first n axes to reduce
+    size_t first_n_reduce_axes = dim_index_2_num_dims(axis, input->num_dimensions());
 
     // Create intermediate tensor info
     DataType   tmp_data_type = is_data_type_quantized_asymmetric(input->data_type()) ? DataType::S32 : input->data_type();
@@ -180,20 +191,20 @@
     TensorInfo tensor_info_max(input->clone()->set_tensor_shape(max_sum_shape).set_is_resizable(true));
     TensorInfo tensor_info_sum(input->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(QuantizationInfo()).set_is_resizable(true));
 
-    const bool needs_flattening = (axis != 1);
+    const bool needs_flattening = (first_n_reduce_axes > 1);
 
     if(needs_flattening)
     {
-        const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input, axis);
+        const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input, first_n_reduce_axes);
         TensorInfo        tensor_info_flat(input->clone()->set_tensor_shape(shape_flatten).set_is_resizable(true));
 
-        if(axis != 3)
+        if(first_n_reduce_axes == 3)
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(input, &tensor_info_flat));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayer::validate(input, &tensor_info_flat));
         }
         else
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLFlattenLayerKernel::validate(input, &tensor_info_flat));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(input, &tensor_info_flat));
         }
     }
 
@@ -221,7 +232,7 @@
 
     if(_needs_flattening)
     {
-        CLScheduler::get().enqueue(*_flatten_kernel_ptr, false);
+        _flatten_ptr->run();
     }
 
     CLScheduler::get().enqueue(_max_shift_exp_sum_kernel, false);
@@ -229,7 +240,7 @@
 
     if(_needs_flattening)
     {
-        CLScheduler::get().enqueue(_reshape_kernel, true);
+        _reshape.run();
     }
 }
 
diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
index 021d316..eea3cb5 100644
--- a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
+++ b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
index a4ffefc..06aa92d 100644
--- a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
+++ b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLSplit.cpp b/src/runtime/CL/functions/CLSplit.cpp
index cdc44d8..db0b14b 100644
--- a/src/runtime/CL/functions/CLSplit.cpp
+++ b/src/runtime/CL/functions/CLSplit.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLStackLayer.cpp b/src/runtime/CL/functions/CLStackLayer.cpp
index 79c3fe5..39f0ab4 100644
--- a/src/runtime/CL/functions/CLStackLayer.cpp
+++ b/src/runtime/CL/functions/CLStackLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLStridedSlice.cpp b/src/runtime/CL/functions/CLStridedSlice.cpp
index 4547596..b78073d 100644
--- a/src/runtime/CL/functions/CLStridedSlice.cpp
+++ b/src/runtime/CL/functions/CLStridedSlice.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,20 +23,16 @@
  */
 #include "arm_compute/runtime/CL/functions/CLStridedSlice.h"
 
+#include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
 #include "arm_compute/core/Types.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
 {
-void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output,
-                               const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                               int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+namespace experimental
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
-}
-
-void CLStridedSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
+void CLStridedSlice::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output,
                                const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
                                int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
 {
@@ -51,4 +47,58 @@
 {
     return CLStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
 }
+} // namespace experimental
+
+struct CLStridedSlice::Impl
+{
+    const ICLTensor                              *src{ nullptr };
+    ICLTensor                                    *dst{ nullptr };
+    CLRuntimeContext                             *ctx{ nullptr };
+    std::unique_ptr<experimental::CLStridedSlice> op{ nullptr };
+};
+
+CLStridedSlice::CLStridedSlice(CLRuntimeContext *ctx)
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+    _impl->ctx = ctx;
+}
+
+CLStridedSlice::CLStridedSlice(CLStridedSlice &&) = default;
+CLStridedSlice &CLStridedSlice::operator=(CLStridedSlice &&) = default;
+CLStridedSlice::~CLStridedSlice()                            = default;
+
+void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output,
+                               const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                               int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+}
+
+void CLStridedSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
+                               const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                               int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+    _impl->src = input;
+    _impl->dst = output;
+
+    _impl->op = arm_compute::support::cpp14::make_unique<experimental::CLStridedSlice>();
+    _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+}
+
+Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                                int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+    return experimental::CLStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+}
+
+void CLStridedSlice::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLTableLookup.cpp b/src/runtime/CL/functions/CLTableLookup.cpp
index 47e15d3..3d2d185 100644
--- a/src/runtime/CL/functions/CLTableLookup.cpp
+++ b/src/runtime/CL/functions/CLTableLookup.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLThreshold.cpp b/src/runtime/CL/functions/CLThreshold.cpp
index 57c9272..bdbf37e 100644
--- a/src/runtime/CL/functions/CLThreshold.cpp
+++ b/src/runtime/CL/functions/CLThreshold.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,17 +28,22 @@
 
 #include <utility>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void CLThreshold::configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, threshold, false_value, true_value, type, upper);
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, ThresholdKernelInfo(threshold, false_value, true_value, type, upper));
 }
 
-void CLThreshold::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type,
-                            uint8_t upper)
+void CLThreshold::configure(const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
+}
+
+void CLThreshold::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ThresholdKernelInfo &info)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLThresholdKernel>();
-    k->configure(compile_context, input, output, threshold, false_value, true_value, type, upper);
+    k->configure(compile_context, input, output, info);
     _kernel = std::move(k);
 }
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLTile.cpp b/src/runtime/CL/functions/CLTile.cpp
index 178d7af..68efad0 100644
--- a/src/runtime/CL/functions/CLTile.cpp
+++ b/src/runtime/CL/functions/CLTile.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLTranspose.cpp b/src/runtime/CL/functions/CLTranspose.cpp
index f5121d0..8cade66 100644
--- a/src/runtime/CL/functions/CLTranspose.cpp
+++ b/src/runtime/CL/functions/CLTranspose.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLUnstack.cpp b/src/runtime/CL/functions/CLUnstack.cpp
index 032fb99..28d122b 100644
--- a/src/runtime/CL/functions/CLUnstack.cpp
+++ b/src/runtime/CL/functions/CLUnstack.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLUpsampleLayer.cpp b/src/runtime/CL/functions/CLUpsampleLayer.cpp
index dd04686..e9456c1 100644
--- a/src/runtime/CL/functions/CLUpsampleLayer.cpp
+++ b/src/runtime/CL/functions/CLUpsampleLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLWarpAffine.cpp b/src/runtime/CL/functions/CLWarpAffine.cpp
index ce2171b..fffc58c 100644
--- a/src/runtime/CL/functions/CLWarpAffine.cpp
+++ b/src/runtime/CL/functions/CLWarpAffine.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLWarpPerspective.cpp b/src/runtime/CL/functions/CLWarpPerspective.cpp
index 06c0661..2b4b187 100644
--- a/src/runtime/CL/functions/CLWarpPerspective.cpp
+++ b/src/runtime/CL/functions/CLWarpPerspective.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index 132c3ee..09a35a6 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLWinogradInputTransform.cpp b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
index ae40076..9498206 100644
--- a/src/runtime/CL/functions/CLWinogradInputTransform.cpp
+++ b/src/runtime/CL/functions/CLWinogradInputTransform.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/functions/CLYOLOLayer.cpp b/src/runtime/CL/functions/CLYOLOLayer.cpp
index 0c0c106..d553f97 100644
--- a/src/runtime/CL/functions/CLYOLOLayer.cpp
+++ b/src/runtime/CL/functions/CLYOLOLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp b/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp
index 041e7d6..8b1c9a5 100644
--- a/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp
+++ b/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp b/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp
index a94a392..44700ad 100644
--- a/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp
+++ b/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp b/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp
index 775bb9b..8b4c9e7 100644
--- a/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp
+++ b/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/tuners/BifrostTuner.cpp b/src/runtime/CL/tuners/BifrostTuner.cpp
index 5b23baa..52644bf 100644
--- a/src/runtime/CL/tuners/BifrostTuner.cpp
+++ b/src/runtime/CL/tuners/BifrostTuner.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -255,7 +255,7 @@
     cl::NDRange               lws_hint      = k.lws_hint();
     const GPUTarget           gpu_target    = k.get_target();
     const DataType            dt            = k.input()->info()->data_type();
-    const InterpolationPolicy interpolation = k._interpolationPolicy;
+    const InterpolationPolicy interpolation = k.get_interpolation_policy();
 
     // Configure the local work size for Bifrost, interpolation (bilinear) and datatype F32.
     // The value are obtained via exhaustive autotuning.
@@ -315,5 +315,10 @@
 {
     ARM_COMPUTE_UNUSED(kernel);
 }
+
+void BifrostTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors)
+{
+    ARM_COMPUTE_UNUSED(kernel, tensors);
+}
 } // namespace tuners
 } // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CL/tuners/CLLWSList.cpp b/src/runtime/CL/tuners/CLLWSList.cpp
index 30fd558..c537f15 100644
--- a/src/runtime/CL/tuners/CLLWSList.cpp
+++ b/src/runtime/CL/tuners/CLLWSList.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CL/tuners/MidgardTuner.cpp b/src/runtime/CL/tuners/MidgardTuner.cpp
index cae3123..e49e155 100644
--- a/src/runtime/CL/tuners/MidgardTuner.cpp
+++ b/src/runtime/CL/tuners/MidgardTuner.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -73,5 +73,10 @@
 {
     ARM_COMPUTE_UNUSED(kernel);
 }
+
+void MidgardTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors)
+{
+    ARM_COMPUTE_UNUSED(kernel, tensors);
+}
 } // namespace tuners
 } // namespace arm_compute
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index 0a03497..55f62c1 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -95,10 +95,10 @@
 
     // nt = sqrt(max_threads * (m / n) )
     const unsigned adjusted = std::round(
-                    std::sqrt(max_threads * ratio));
+                                  std::sqrt(max_threads * ratio));
 
     //find the nearest factor of max_threads
-    for(unsigned i = 0; i!= adjusted; ++i)
+    for(unsigned i = 0; i != adjusted; ++i)
     {
         //try down
         const unsigned adj_down = adjusted - i;
@@ -118,11 +118,11 @@
     //we didn't find anything so lets bail out with maxes biased to the largest dimension
     if(m > n)
     {
-         return{ std::min<unsigned>(m, max_threads), 1 };
+        return { std::min<unsigned>(m, max_threads), 1 };
     }
     else
     {
-        return{ 1, std::min<unsigned>(n, max_threads) };
+        return { 1, std::min<unsigned>(n, max_threads) };
     }
 }
 
@@ -145,38 +145,30 @@
     while(feeder.get_next(workload_index));
 }
 
-} //namespace
-
-struct CPPScheduler::Impl final
+void set_thread_affinity(int core_id)
 {
-    explicit Impl(unsigned int thread_hint)
-        : _num_threads(thread_hint), _threads(_num_threads - 1)
+    if(core_id < 0)
     {
-    }
-    void set_num_threads(unsigned int num_threads, unsigned int thead_hint)
-    {
-        _num_threads = num_threads == 0 ? thead_hint : num_threads;
-        _threads.resize(_num_threads - 1);
-    }
-    unsigned int num_threads() const
-    {
-        return _num_threads;
+        return;
     }
 
-    void run_workloads(std::vector<IScheduler::Workload> &workloads);
+    cpu_set_t set;
+    CPU_ZERO(&set);
+    CPU_SET(core_id, &set);
+    ARM_COMPUTE_EXIT_ON_MSG(sched_setaffinity(0, sizeof(set), &set),
+                            "Error setting thread affinity");
+}
 
-    class Thread;
-
-    unsigned int       _num_threads;
-    std::list<Thread>  _threads;
-    arm_compute::Mutex _run_workloads_mutex{};
-};
-
-class CPPScheduler::Impl::Thread final
+class Thread final
 {
 public:
-    /** Start a new thread. */
-    Thread();
+    /** Start a new thread
+     *
+     * Thread will be pinned to a given core id if value is non-negative
+     *
+     * @param[in] core_pin Core id to pin the thread on. If negative no thread pinning will take place
+     */
+    explicit Thread(int core_pin = -1);
 
     Thread(const Thread &) = delete;
     Thread &operator=(const Thread &) = delete;
@@ -212,14 +204,16 @@
     bool                               _wait_for_work{ false };
     bool                               _job_complete{ true };
     std::exception_ptr                 _current_exception{ nullptr };
+    int                                _core_pin{ -1 };
 };
 
-CPPScheduler::Impl::Thread::Thread()
+Thread::Thread(int core_pin)
+    : _core_pin(core_pin)
 {
     _thread = std::thread(&Thread::worker_thread, this);
 }
 
-CPPScheduler::Impl::Thread::~Thread()
+Thread::~Thread()
 {
     // Make sure worker thread has ended
     if(_thread.joinable())
@@ -230,7 +224,7 @@
     }
 }
 
-void CPPScheduler::Impl::Thread::start(std::vector<IScheduler::Workload> *workloads, ThreadFeeder &feeder, const ThreadInfo &info)
+void Thread::start(std::vector<IScheduler::Workload> *workloads, ThreadFeeder &feeder, const ThreadInfo &info)
 {
     _workloads = workloads;
     _feeder    = &feeder;
@@ -243,7 +237,7 @@
     _cv.notify_one();
 }
 
-void CPPScheduler::Impl::Thread::wait()
+void Thread::wait()
 {
     {
         std::unique_lock<std::mutex> lock(_m);
@@ -256,8 +250,10 @@
     }
 }
 
-void CPPScheduler::Impl::Thread::worker_thread()
+void Thread::worker_thread()
 {
+    set_thread_affinity(_core_pin);
+
     while(true)
     {
         std::unique_lock<std::mutex> lock(_m);
@@ -290,6 +286,44 @@
         _cv.notify_one();
     }
 }
+} //namespace
+
+struct CPPScheduler::Impl final
+{
+    explicit Impl(unsigned int thread_hint)
+        : _num_threads(thread_hint), _threads(_num_threads - 1)
+    {
+    }
+    void set_num_threads(unsigned int num_threads, unsigned int thread_hint)
+    {
+        _num_threads = num_threads == 0 ? thread_hint : num_threads;
+        _threads.resize(_num_threads - 1);
+    }
+    void set_num_threads_with_affinity(unsigned int num_threads, unsigned int thread_hint, BindFunc func)
+    {
+        _num_threads = num_threads == 0 ? thread_hint : num_threads;
+
+        // Set affinity on main thread
+        set_thread_affinity(func(0, thread_hint));
+
+        // Set affinity on worker threads
+        _threads.clear();
+        for(auto i = 1U; i < _num_threads; ++i)
+        {
+            _threads.emplace_back(func(i, thread_hint));
+        }
+    }
+    unsigned int num_threads() const
+    {
+        return _num_threads;
+    }
+
+    void run_workloads(std::vector<IScheduler::Workload> &workloads);
+
+    unsigned int       _num_threads;
+    std::list<Thread>  _threads;
+    arm_compute::Mutex _run_workloads_mutex{};
+};
 
 /*
  * This singleton has been deprecated and will be removed in the next release
@@ -314,6 +348,13 @@
     _impl->set_num_threads(num_threads, num_threads_hint());
 }
 
+void CPPScheduler::set_num_threads_with_affinity(unsigned int num_threads, BindFunc func)
+{
+    // No changes in the number of threads while current workloads are running
+    arm_compute::lock_guard<std::mutex> lock(_impl->_run_workloads_mutex);
+    _impl->set_num_threads_with_affinity(num_threads, num_threads_hint(), func);
+}
+
 unsigned int CPPScheduler::num_threads() const
 {
     return _impl->num_threads();
@@ -364,11 +405,11 @@
 }
 #endif /* DOXYGEN_SKIP_THIS */
 
-void CPPScheduler::schedule(ICPPKernel *kernel, const Hints &hints)
+void CPPScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, ITensorPack &tensors)
 {
     ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
 
-    const Window      &max_window     = kernel->window();
+    const Window &max_window = kernel->window();
 
     if(hints.split_dimension() == IScheduler::split_dimensions_all)
     {
@@ -379,34 +420,32 @@
         const std::size_t m = max_window.num_iterations(Window::DimX);
         const std::size_t n = max_window.num_iterations(Window::DimY);
 
-       //in c++17 this can be swapped for   auto [ m_threads, n_threads ] = split_2d(...
+        //in c++17 this can be swapped for   auto [ m_threads, n_threads ] = split_2d(...
         unsigned m_threads, n_threads;
         std::tie(m_threads, n_threads) = split_2d(_impl->_num_threads, m, n);
 
         std::vector<IScheduler::Workload> workloads;
-        for(unsigned int ni  = 0; ni != n_threads; ++ni)
+        for(unsigned int ni = 0; ni != n_threads; ++ni)
         {
-            for(unsigned int mi  = 0; mi != m_threads; ++mi)
+            for(unsigned int mi = 0; mi != m_threads; ++mi)
             {
                 workloads.push_back(
-                    [ ni, mi, m_threads, n_threads, &max_window, &kernel ]
-                    (const ThreadInfo & info)
-                    {
-                        //narrow the window to our mi-ni workload
-                        Window win = max_window.split_window(Window::DimX, mi, m_threads)
-                                               .split_window(Window::DimY, ni, n_threads);
+                    [ni, mi, m_threads, n_threads, &max_window, &kernel](const ThreadInfo & info)
+                {
+                    //narrow the window to our mi-ni workload
+                    Window win = max_window.split_window(Window::DimX, mi, m_threads)
+                                 .split_window(Window::DimY, ni, n_threads);
 
-                        win.validate();
+                    win.validate();
 
-                        Window thread_locator;
-                        thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads));
-                        thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads));
+                    Window thread_locator;
+                    thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads));
+                    thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads));
 
-                        thread_locator.validate();
+                    thread_locator.validate();
 
-                        kernel->run_nd(win, info, thread_locator);
-                    }
-                );
+                    kernel->run_nd(win, info, thread_locator);
+                });
             }
         }
         run_workloads(workloads);
@@ -425,7 +464,14 @@
         {
             ThreadInfo info;
             info.cpu_info = &_cpu_info;
-            kernel->run(max_window, info);
+            if(tensors.empty())
+            {
+                kernel->run(max_window, info);
+            }
+            else
+            {
+                kernel->run_op(tensors, max_window, info);
+            }
         }
         else
         {
@@ -449,15 +495,34 @@
             for(unsigned int t = 0; t < num_windows; t++)
             {
                 //Capture 't' by copy, all the other variables by reference:
-                workloads[t] = [t, &hints, &max_window, &num_windows, &kernel](const ThreadInfo & info)
+                workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo & info)
                 {
                     Window win = max_window.split_window(hints.split_dimension(), t, num_windows);
                     win.validate();
-                    kernel->run(win, info);
+
+                    if(tensors.empty())
+                    {
+                        kernel->run(win, info);
+                    }
+                    else
+                    {
+                        kernel->run_op(tensors, win, info);
+                    }
                 };
             }
             run_workloads(workloads);
         }
     }
 }
+
+void CPPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, ITensorPack &tensors)
+{
+    schedule_common(kernel, hints, tensors);
+}
+
+void CPPScheduler::schedule(ICPPKernel *kernel, const Hints &hints)
+{
+    ITensorPack tensors;
+    schedule_common(kernel, hints, tensors);
+}
 } // namespace arm_compute
diff --git a/src/runtime/CPP/ICPPSimpleFunction.cpp b/src/runtime/CPP/ICPPSimpleFunction.cpp
index 42a2d22..f4fef11 100644
--- a/src/runtime/CPP/ICPPSimpleFunction.cpp
+++ b/src/runtime/CPP/ICPPSimpleFunction.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CPP/SingleThreadScheduler.cpp b/src/runtime/CPP/SingleThreadScheduler.cpp
index 152569f..6f67bc0 100644
--- a/src/runtime/CPP/SingleThreadScheduler.cpp
+++ b/src/runtime/CPP/SingleThreadScheduler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,12 +37,26 @@
 
 void SingleThreadScheduler::schedule(ICPPKernel *kernel, const Hints &hints)
 {
-    ARM_COMPUTE_UNUSED(hints);
+    const Window      &max_window     = kernel->window();
+    const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
+    if(num_iterations < 1)
+    {
+        return;
+    }
+
     ThreadInfo info;
     info.cpu_info = &_cpu_info;
     kernel->run(kernel->window(), info);
 }
 
+void SingleThreadScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, ITensorPack &tensors)
+{
+    ARM_COMPUTE_UNUSED(hints);
+    ThreadInfo info;
+    info.cpu_info = &_cpu_info;
+    kernel->run_op(tensors, kernel->window(), info);
+}
+
 void SingleThreadScheduler::run_workloads(std::vector<Workload> &workloads)
 {
     ThreadInfo info;
diff --git a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
index 232f71d..b6803d0 100644
--- a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
+++ b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
index 4ec0ab6..9d62733 100644
--- a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
+++ b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
index b3fc9c7..3507a3a 100644
--- a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
+++ b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp b/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp
index 8856191..f9d2bad 100644
--- a/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp
+++ b/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CPP/functions/CPPPermute.cpp b/src/runtime/CPP/functions/CPPPermute.cpp
index 1cdfe92..7ea1070 100644
--- a/src/runtime/CPP/functions/CPPPermute.cpp
+++ b/src/runtime/CPP/functions/CPPPermute.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CPP/functions/CPPTopKV.cpp b/src/runtime/CPP/functions/CPPTopKV.cpp
index eb0d560..bd089ac 100644
--- a/src/runtime/CPP/functions/CPPTopKV.cpp
+++ b/src/runtime/CPP/functions/CPPTopKV.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CPP/functions/CPPUpsample.cpp b/src/runtime/CPP/functions/CPPUpsample.cpp
index a154b5e..7dfc3b8 100644
--- a/src/runtime/CPP/functions/CPPUpsample.cpp
+++ b/src/runtime/CPP/functions/CPPUpsample.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/CPUUtils.cpp b/src/runtime/CPUUtils.cpp
index d8f01a9..4d6caae 100644
--- a/src/runtime/CPUUtils.cpp
+++ b/src/runtime/CPUUtils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -73,6 +73,7 @@
     {
         case CPUModel::GENERIC_FP16_DOT:
         case CPUModel::A55r1:
+        case CPUModel::X1:
             return true;
         default:
             return false;
@@ -86,6 +87,7 @@
         case CPUModel::GENERIC_FP16:
         case CPUModel::GENERIC_FP16_DOT:
         case CPUModel::A55r1:
+        case CPUModel::X1:
             return true;
         default:
             return false;
@@ -121,6 +123,12 @@
                     model = CPUModel::A55r0;
                 }
                 break;
+            case 0xd44: // X1
+                model = CPUModel::X1;
+                break;
+            case 0xd09: // A73
+                model = CPUModel::A73;
+                break;
             case 0xd0a: // A75
                 if(variant != 0)
                 {
diff --git a/src/runtime/DeviceProperties.cpp b/src/runtime/DeviceProperties.cpp
index e88aa71..5d7ae02 100644
--- a/src/runtime/DeviceProperties.cpp
+++ b/src/runtime/DeviceProperties.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/Distribution1D.cpp b/src/runtime/Distribution1D.cpp
index 7080220..c19862c 100644
--- a/src/runtime/Distribution1D.cpp
+++ b/src/runtime/Distribution1D.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp b/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp
index 70a1f4f..ec91027 100644
--- a/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp
+++ b/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/GLES_COMPUTE/GCHelpers.cpp b/src/runtime/GLES_COMPUTE/GCHelpers.cpp
index df2f4f5..f4378d0 100644
--- a/src/runtime/GLES_COMPUTE/GCHelpers.cpp
+++ b/src/runtime/GLES_COMPUTE/GCHelpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/GLES_COMPUTE/GCMemory.cpp b/src/runtime/GLES_COMPUTE/GCMemory.cpp
index f1457c4..998f8a5 100644
--- a/src/runtime/GLES_COMPUTE/GCMemory.cpp
+++ b/src/runtime/GLES_COMPUTE/GCMemory.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/GLES_COMPUTE/GCMemoryRegion.cpp b/src/runtime/GLES_COMPUTE/GCMemoryRegion.cpp
index 2ffd9f2..562854f 100644
--- a/src/runtime/GLES_COMPUTE/GCMemoryRegion.cpp
+++ b/src/runtime/GLES_COMPUTE/GCMemoryRegion.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/GLES_COMPUTE/GCRuntimeContext.cpp b/src/runtime/GLES_COMPUTE/GCRuntimeContext.cpp
index 1c30af1..6599f52 100644
--- a/src/runtime/GLES_COMPUTE/GCRuntimeContext.cpp
+++ b/src/runtime/GLES_COMPUTE/GCRuntimeContext.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/GLES_COMPUTE/GCScheduler.cpp b/src/runtime/GLES_COMPUTE/GCScheduler.cpp
index a45d793..946d558 100644
--- a/src/runtime/GLES_COMPUTE/GCScheduler.cpp
+++ b/src/runtime/GLES_COMPUTE/GCScheduler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/GLES_COMPUTE/GCTensor.cpp b/src/runtime/GLES_COMPUTE/GCTensor.cpp
index e05eb4c..a73c995 100644
--- a/src/runtime/GLES_COMPUTE/GCTensor.cpp
+++ b/src/runtime/GLES_COMPUTE/GCTensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp b/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp
index 61523bc..ff96c3c 100644
--- a/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp
+++ b/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/GLES_COMPUTE/IGCSimpleFunction.cpp b/src/runtime/GLES_COMPUTE/IGCSimpleFunction.cpp
index bb9239e..4bb6a99 100644
--- a/src/runtime/GLES_COMPUTE/IGCSimpleFunction.cpp
+++ b/src/runtime/GLES_COMPUTE/IGCSimpleFunction.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.cpp b/src/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.cpp
index 5098dd7..1b13143 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/GLES_COMPUTE/functions/GCActivationLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCActivationLayer.cpp
index 4f5ee28..a7ec758 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCActivationLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCActivationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.cpp b/src/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.cpp
index b0d8a3c..580f8d5 100755
--- a/src/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.cpp
index cc5e8f4..7ec0e42 100755
--- a/src/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp
index 81e98f1..9e23974 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
index 61c0740..0d0526d 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
index 9a9f30d..4ddd0ab 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
index 3bc3398..c2aa815 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDropoutLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDropoutLayer.cpp
index 6407464..661bf5f 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCDropoutLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCDropoutLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/GLES_COMPUTE/functions/GCFillBorder.cpp b/src/runtime/GLES_COMPUTE/functions/GCFillBorder.cpp
index d1d9874..080b5a2 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCFillBorder.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCFillBorder.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
index d391edd..57a09ed 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
index ddfe590..a5a26f4 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.cpp
index cc37bf4..1366a13 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.cpp
index af933fa..877f81a 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
index 8f60279..c4bf141 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.cpp
index 19fdc3d..3e677b5 100755
--- a/src/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.cpp b/src/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.cpp
index 1075f0b..daf978f 100755
--- a/src/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp
index accf60e..e4ccabc 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/GLES_COMPUTE/functions/GCScale.cpp b/src/runtime/GLES_COMPUTE/functions/GCScale.cpp
index f245c3e..dccbe99 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCScale.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCScale.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,14 +29,19 @@
 #include "arm_compute/core/Validate.h"
 #include "support/MemorySupport.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void GCScale::configure(IGCTensor *input, IGCTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding,
                         bool align_corners)
 {
-    ARM_COMPUTE_UNUSED(use_padding, align_corners);
-    auto k = arm_compute::support::cpp14::make_unique<GCScaleKernel>();
-    k->configure(input, output, policy, border_mode == BorderMode::UNDEFINED, sampling_policy);
-    _kernel = std::move(k);
-    _border_handler.configure(input, _kernel->border_size(), border_mode, constant_border_value);
+    configure(input, output, ScaleKernelInfo{ policy, border_mode, constant_border_value, sampling_policy, use_padding, align_corners });
 }
+
+void GCScale::configure(IGCTensor *input, IGCTensor *output, const ScaleKernelInfo &info)
+{
+    auto k = arm_compute::support::cpp14::make_unique<GCScaleKernel>();
+    k->configure(input, output, info);
+    _kernel = std::move(k);
+    _border_handler.configure(input, _kernel->border_size(), info.border_mode, info.constant_border_value);
+}
+} // namespace arm_compute
diff --git a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
index 0645ae7..48d8cb5 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,20 +27,20 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 GCSoftmaxLayer::GCSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)), _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp()
 {
 }
 
-void GCSoftmaxLayer::configure(const IGCTensor *input, IGCTensor *output, float beta, size_t axis)
+void GCSoftmaxLayer::configure(const IGCTensor *input, IGCTensor *output, float beta, size_t reduce_end_axis)
 {
-    ARM_COMPUTE_UNUSED(beta, axis);
+    ARM_COMPUTE_UNUSED(beta, reduce_end_axis);
 
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON(beta != 1.0f);
-    ARM_COMPUTE_ERROR_ON_MSG(axis != 1, "Axis must be 1 for GLES");
+    ARM_COMPUTE_ERROR_ON_MSG(reduce_end_axis != 0, "Reduce_end_axis must be 0 for GLES");
 
     // Create intermediate tensors shapes
     _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type()));
@@ -77,3 +77,5 @@
     GCScheduler::get().memory_barrier();
     GCScheduler::get().dispatch(_norm_kernel);
 }
+
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/GLES_COMPUTE/functions/GCTensorShift.cpp b/src/runtime/GLES_COMPUTE/functions/GCTensorShift.cpp
index 7ef07e9..4cbd2e3 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCTensorShift.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCTensorShift.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/GLES_COMPUTE/functions/GCTranspose.cpp b/src/runtime/GLES_COMPUTE/functions/GCTranspose.cpp
index 530f52a..da4471c 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCTranspose.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCTranspose.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/HOG.cpp b/src/runtime/HOG.cpp
index d312967..7492276 100644
--- a/src/runtime/HOG.cpp
+++ b/src/runtime/HOG.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/ILutAllocator.cpp b/src/runtime/ILutAllocator.cpp
index fb96163..8ffb074 100644
--- a/src/runtime/ILutAllocator.cpp
+++ b/src/runtime/ILutAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp
index b2edad0..6b961d7 100644
--- a/src/runtime/IScheduler.cpp
+++ b/src/runtime/IScheduler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -41,6 +41,12 @@
     return _cpu_info;
 }
 
+void IScheduler::set_num_threads_with_affinity(unsigned int num_threads, BindFunc func)
+{
+    ARM_COMPUTE_UNUSED(num_threads, func);
+    ARM_COMPUTE_ERROR("Feature for affinity setting is not implemented");
+}
+
 unsigned int IScheduler::num_threads_hint() const
 {
     return _num_threads_hint;
diff --git a/src/runtime/ISimpleLifetimeManager.cpp b/src/runtime/ISimpleLifetimeManager.cpp
index d0c9919..a6bc950 100644
--- a/src/runtime/ISimpleLifetimeManager.cpp
+++ b/src/runtime/ISimpleLifetimeManager.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/ITensorAllocator.cpp b/src/runtime/ITensorAllocator.cpp
index 087f324..ae648d4 100644
--- a/src/runtime/ITensorAllocator.cpp
+++ b/src/runtime/ITensorAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/IWeightsManager.cpp b/src/runtime/IWeightsManager.cpp
index b367b5f..081cd99 100644
--- a/src/runtime/IWeightsManager.cpp
+++ b/src/runtime/IWeightsManager.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/Lut.cpp b/src/runtime/Lut.cpp
index 1b3daf1..9c3d2d3 100644
--- a/src/runtime/Lut.cpp
+++ b/src/runtime/Lut.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/LutAllocator.cpp b/src/runtime/LutAllocator.cpp
index 4b77a45..cc0c24f 100644
--- a/src/runtime/LutAllocator.cpp
+++ b/src/runtime/LutAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/MEMUtils.cpp b/src/runtime/MEMUtils.cpp
index 054169a..8b39a0f 100644
--- a/src/runtime/MEMUtils.cpp
+++ b/src/runtime/MEMUtils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/Memory.cpp b/src/runtime/Memory.cpp
index c6b956d..ac0a325 100644
--- a/src/runtime/Memory.cpp
+++ b/src/runtime/Memory.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/MemoryManagerOnDemand.cpp b/src/runtime/MemoryManagerOnDemand.cpp
index d9803a8..2e418ae 100644
--- a/src/runtime/MemoryManagerOnDemand.cpp
+++ b/src/runtime/MemoryManagerOnDemand.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/MultiHOG.cpp b/src/runtime/MultiHOG.cpp
index 1584e3a..d68b755 100644
--- a/src/runtime/MultiHOG.cpp
+++ b/src/runtime/MultiHOG.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/MultiImage.cpp b/src/runtime/MultiImage.cpp
index eec58d3..66e67ed 100644
--- a/src/runtime/MultiImage.cpp
+++ b/src/runtime/MultiImage.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp b/src/runtime/NEON/INEOperator.cpp
similarity index 60%
copy from src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
copy to src/runtime/NEON/INEOperator.cpp
index 36f84d8..75068b1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_trans.hpp
+++ b/src/runtime/NEON/INEOperator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,37 +21,36 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#pragma once
+#include "arm_compute/runtime/NEON/INEOperator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
 
-#ifdef __aarch64__
+namespace arm_compute
+{
+namespace experimental
+{
+INEOperator::INEOperator(IRuntimeContext *ctx)
+    : _kernel(), _ctx(ctx), _workspace()
+{
+}
 
-namespace arm_gemm {
-
-// Actual kernel implementations
-void a64_sgemv_trans(const float *, const float *, float *, float, int, int, int);
-
-// Transposed SGEMV strategy class.
-class sgemv_trans {
-public:
-    typedef float operand_type;
-    typedef float result_type;
-
-    typedef void (*kern_type)(const float *, const float *, float *, float, int, int, int);
-
-    /* Kernel blocking parameters */
-    static unsigned int out_width() {
-        return 96;
+void INEOperator::run(ITensorPack &tensors)
+{
+    if(tensors.empty())
+    {
+        ARM_COMPUTE_ERROR("No inputs provided");
     }
 
-    static unsigned int k_unroll() {
-        return 1;
-    }
+    NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, tensors);
+}
 
-    kern_type kernel=a64_sgemv_trans;
+void INEOperator::prepare(ITensorPack &constants)
+{
+    ARM_COMPUTE_UNUSED(constants);
+}
 
-    sgemv_trans(const CPUInfo *ci) { UNUSED(ci); }
-};
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
+MemoryRequirements INEOperator::workspace() const
+{
+    return {};
+}
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/runtime/NEON/INESimpleFunction.cpp b/src/runtime/NEON/INESimpleFunction.cpp
index 23d9872..cef2762 100644
--- a/src/runtime/NEON/INESimpleFunction.cpp
+++ b/src/runtime/NEON/INESimpleFunction.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp
index 2cabee4..82316c4 100644
--- a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp
+++ b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEAbsoluteDifference.cpp b/src/runtime/NEON/functions/NEAbsoluteDifference.cpp
index 06b38a9..ec27820 100644
--- a/src/runtime/NEON/functions/NEAbsoluteDifference.cpp
+++ b/src/runtime/NEON/functions/NEAbsoluteDifference.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEAccumulate.cpp b/src/runtime/NEON/functions/NEAccumulate.cpp
index 47ea83d..662f8cc 100644
--- a/src/runtime/NEON/functions/NEAccumulate.cpp
+++ b/src/runtime/NEON/functions/NEAccumulate.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp
index e4d1125..7f55edb 100644
--- a/src/runtime/NEON/functions/NEActivationLayer.cpp
+++ b/src/runtime/NEON/functions/NEActivationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,25 +23,71 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 
+#include "arm_compute/core/Error.h"
 #include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
+#include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/runtime/IRuntimeContext.h"
+#include "arm_compute/runtime/Tensor.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
 {
-NEActivationLayer::NEActivationLayer(IRuntimeContext *ctx) // NOLINT
-    : INESimpleFunctionNoBorder(ctx)
+namespace experimental
 {
-}
-void NEActivationLayer::configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info)
+void NEActivationLayer::configure(const ITensorInfo *input, ITensorInfo *output, const ActivationLayerInfo &activation_info)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEActivationLayerKernel>();
     k->configure(input, output, activation_info);
     _kernel = std::move(k);
 }
 
+Status NEActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info)
+{
+    return NEActivationLayerKernel::validate(input, output, activation_info);
+}
+} // namespace experimental
+
+struct NEActivationLayer::Impl
+{
+    const ITensor                                   *src{ nullptr };
+    ITensor                                         *dst{ nullptr };
+    IRuntimeContext                                 *ctx{ nullptr };
+    std::unique_ptr<experimental::NEActivationLayer> op{ nullptr };
+};
+
+NEActivationLayer::NEActivationLayer(IRuntimeContext *ctx)
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+    _impl->ctx = ctx;
+}
+
+NEActivationLayer::NEActivationLayer(NEActivationLayer &&) = default;
+
+NEActivationLayer &NEActivationLayer::operator=(NEActivationLayer &&) = default;
+
+NEActivationLayer::~NEActivationLayer() = default;
+
+void NEActivationLayer::configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
+    _impl->src = input;
+    _impl->dst = output == nullptr ? input : output;
+
+    _impl->op = arm_compute::support::cpp14::make_unique<experimental::NEActivationLayer>();
+    _impl->op->configure(_impl->src->info(), _impl->dst->info(), activation_info);
+}
+
 Status NEActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
-    return NEActivationLayerKernel::validate(input, output, act_info);
+    return experimental::NEActivationLayer::validate(input, output, act_info);
+}
+
+void NEActivationLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
index a23061e..0664d3c 100644
--- a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
+++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
index 06c71db..4453a01 100644
--- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,7 +31,9 @@
 
 namespace arm_compute
 {
-void NEArithmeticAddition::configure(ITensor *input1, ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+namespace experimental
+{
+void NEArithmeticAddition::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(act_info);
     auto k = arm_compute::support::cpp14::make_unique<NEArithmeticAdditionKernel>();
@@ -43,4 +45,44 @@
     ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
     return NEArithmeticAdditionKernel::validate(input1, input2, output, policy);
 }
+} // namespace experimental
+
+struct NEArithmeticAddition::Impl
+{
+    const ITensor                                      *src_0{ nullptr };
+    const ITensor                                      *src_1{ nullptr };
+    ITensor                                            *dst{ nullptr };
+    std::unique_ptr<experimental::NEArithmeticAddition> op{ nullptr };
+};
+
+NEArithmeticAddition::NEArithmeticAddition()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+NEArithmeticAddition::NEArithmeticAddition(NEArithmeticAddition &&) = default;
+NEArithmeticAddition &NEArithmeticAddition::operator=(NEArithmeticAddition &&) = default;
+NEArithmeticAddition::~NEArithmeticAddition()                                  = default;
+
+Status NEArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+{
+    return experimental::NEArithmeticAddition::validate(input1, input2, output, policy, act_info);
+}
+
+void NEArithmeticAddition::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::NEArithmeticAddition>();
+    _impl->op->configure(input1->info(), input2->info(), output->info(), policy, act_info);
+}
+
+void NEArithmeticAddition::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
index 454adc3..1c95bbf 100644
--- a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,22 +31,14 @@
 
 namespace arm_compute
 {
-void NEArithmeticSubtraction::configure(ITensor *input1, ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+namespace experimental
+{
+void NEArithmeticSubtraction::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(act_info);
     auto k = arm_compute::support::cpp14::make_unique<NEArithmeticSubtractionKernel>();
     k->configure(input1, input2, output, policy);
     _kernel = std::move(k);
-
-    if(output->info()->dimension(0) > 1)
-    {
-        ITensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
-
-        if(broadcasted_info->info()->dimension(0) == 1)
-        {
-            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
-        }
-    }
 }
 
 Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
@@ -54,4 +46,44 @@
     ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
     return NEArithmeticSubtractionKernel::validate(input1, input2, output, policy);
 }
+} // namespace experimental
+
+struct NEArithmeticSubtraction::Impl
+{
+    const ITensor                                         *src_0{ nullptr };
+    const ITensor                                         *src_1{ nullptr };
+    ITensor                                               *dst{ nullptr };
+    std::unique_ptr<experimental::NEArithmeticSubtraction> op{ nullptr };
+};
+
+NEArithmeticSubtraction::NEArithmeticSubtraction()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+NEArithmeticSubtraction::NEArithmeticSubtraction(NEArithmeticSubtraction &&) = default;
+NEArithmeticSubtraction &NEArithmeticSubtraction::operator=(NEArithmeticSubtraction &&) = default;
+NEArithmeticSubtraction::~NEArithmeticSubtraction()                                     = default;
+
+Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+{
+    return experimental::NEArithmeticSubtraction::validate(input1, input2, output, policy, act_info);
+}
+
+void NEArithmeticSubtraction::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::NEArithmeticSubtraction>();
+    _impl->op->configure(input1->info(), input2->info(), output->info(), policy, act_info);
+}
+
+void NEArithmeticSubtraction::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
index bb224db..5a593e9 100644
--- a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
index a4db1fd..c06a8aa 100644
--- a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
+++ b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEBitwiseAnd.cpp b/src/runtime/NEON/functions/NEBitwiseAnd.cpp
index 98f4745..1d89308 100644
--- a/src/runtime/NEON/functions/NEBitwiseAnd.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseAnd.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEBitwiseNot.cpp b/src/runtime/NEON/functions/NEBitwiseNot.cpp
index 173b7d5..585b059 100644
--- a/src/runtime/NEON/functions/NEBitwiseNot.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseNot.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEBitwiseOr.cpp b/src/runtime/NEON/functions/NEBitwiseOr.cpp
index 64f1d82..bba866d 100644
--- a/src/runtime/NEON/functions/NEBitwiseOr.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseOr.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEBitwiseXor.cpp b/src/runtime/NEON/functions/NEBitwiseXor.cpp
index 28c1036..188fe3d 100644
--- a/src/runtime/NEON/functions/NEBitwiseXor.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseXor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp
index 6f767c7..b1ecfaf 100644
--- a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp
+++ b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEBox3x3.cpp b/src/runtime/NEON/functions/NEBox3x3.cpp
index 096b226..a380377 100644
--- a/src/runtime/NEON/functions/NEBox3x3.cpp
+++ b/src/runtime/NEON/functions/NEBox3x3.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp
index a57ea60..d7ec52c 100644
--- a/src/runtime/NEON/functions/NECannyEdge.cpp
+++ b/src/runtime/NEON/functions/NECannyEdge.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NECast.cpp b/src/runtime/NEON/functions/NECast.cpp
index 464a608..4b35110 100644
--- a/src/runtime/NEON/functions/NECast.cpp
+++ b/src/runtime/NEON/functions/NECast.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEChannelCombine.cpp b/src/runtime/NEON/functions/NEChannelCombine.cpp
index 37e92c2..e987951 100644
--- a/src/runtime/NEON/functions/NEChannelCombine.cpp
+++ b/src/runtime/NEON/functions/NEChannelCombine.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEChannelExtract.cpp b/src/runtime/NEON/functions/NEChannelExtract.cpp
index 37a9892..d78a8f8 100644
--- a/src/runtime/NEON/functions/NEChannelExtract.cpp
+++ b/src/runtime/NEON/functions/NEChannelExtract.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp
index 46d7783..0392a92 100644
--- a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp
+++ b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NECol2Im.cpp b/src/runtime/NEON/functions/NECol2Im.cpp
index 262ba8f..e4fe36f 100644
--- a/src/runtime/NEON/functions/NECol2Im.cpp
+++ b/src/runtime/NEON/functions/NECol2Im.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEColorConvert.cpp b/src/runtime/NEON/functions/NEColorConvert.cpp
index fff7633..7befac7 100644
--- a/src/runtime/NEON/functions/NEColorConvert.cpp
+++ b/src/runtime/NEON/functions/NEColorConvert.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEComputeAllAnchors.cpp b/src/runtime/NEON/functions/NEComputeAllAnchors.cpp
index 4fb4e8b..cb89117 100644
--- a/src/runtime/NEON/functions/NEComputeAllAnchors.cpp
+++ b/src/runtime/NEON/functions/NEComputeAllAnchors.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEConcatenateLayer.cpp b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
index 61d41b4..8df4f4c 100644
--- a/src/runtime/NEON/functions/NEConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,58 +39,31 @@
 
 namespace arm_compute
 {
-NEConcatenateLayer::NEConcatenateLayer()
-    : _concat_kernels(),
-      _num_inputs(0),
-      _axis(Window::DimX)
+namespace experimental
+{
+NEConcatenation::NEConcatenation()
+    : _concat_kernels(), _num_inputs(0), _axis(0)
 {
 }
 
-void NEConcatenateLayer::configure(std::vector<ITensor *> inputs_vector, ITensor *output, size_t axis)
-{
-    configure_internal(std::move(inputs_vector), output, axis);
-}
-
-void NEConcatenateLayer::configure(std::vector<const ITensor *> inputs_vector, ITensor *output, size_t axis)
-{
-    configure_internal(std::move(inputs_vector), output, axis);
-}
-
-Status NEConcatenateLayer::validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
-{
-    return validate_internal(inputs_vector, output, axis);
-}
-
-Status NEConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
-{
-    return validate_internal(inputs_vector, output, axis);
-}
-
-template <typename TensorType, typename>
-void NEConcatenateLayer::configure_internal(std::vector<TensorType *> &&inputs_vector, ITensor *output, size_t axis)
+void NEConcatenation::configure(const std::vector<const ITensorInfo *> &inputs_vector, ITensorInfo *output, size_t axis)
 {
     ARM_COMPUTE_ERROR_ON(output == nullptr);
+
     _axis       = axis;
     _num_inputs = inputs_vector.size();
 
-    std::vector<ITensorInfo *> inputs_vector_info;
-    inputs_vector_info.reserve(_num_inputs);
-    for(unsigned int i = 0; i < _num_inputs; ++i)
-    {
-        ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i));
-        inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
-    }
-    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, _axis);
+    TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, axis);
 
     // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
-    ARM_COMPUTE_ERROR_THROW_ON(NEConcatenateLayer::validate(inputs_vector_info, output->info(), axis));
+    auto_init_if_empty(*output, output_shape, 1, inputs_vector[0]->data_type());
+    ARM_COMPUTE_ERROR_THROW_ON(NEConcatenateLayer::validate(inputs_vector, output, axis));
 
     unsigned int offset = 0;
 
     for(unsigned int i = 0; i < _num_inputs; ++i)
     {
-        switch(_axis)
+        switch(axis)
         {
             case Window::DimX:
             {
@@ -123,12 +96,11 @@
             default:
                 ARM_COMPUTE_ERROR("Axis not supported");
         }
-        offset += inputs_vector.at(i)->info()->dimension(_axis);
+        offset += inputs_vector.at(i)->dimension(axis);
     }
 }
 
-template <typename TensorInfoType, typename>
-Status NEConcatenateLayer::validate_internal(const std::vector<TensorInfoType *> &inputs_vector, const ITensorInfo *output, size_t axis)
+Status NEConcatenation::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
     ARM_COMPUTE_RETURN_ERROR_ON(inputs_vector.size() < 2);
@@ -174,11 +146,83 @@
     return Status{};
 }
 
+void NEConcatenation::run(ITensorPack &tensors)
+{
+    if(tensors.empty())
+    {
+        ARM_COMPUTE_ERROR("No inputs provided");
+    }
+
+    if(static_cast<int>(tensors.size() - 1) != static_cast<int>(_num_inputs))
+    {
+        ARM_COMPUTE_ERROR("Configured with different number of inputs");
+    }
+
+    int i = 0;
+    for(auto &k : _concat_kernels)
+    {
+        ITensorPack pack;
+        pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i));
+        pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(ACL_DST));
+        NEScheduler::get().schedule_op(k.get(), Window::DimY, pack);
+        ++i;
+    }
+}
+} // namespace experimental
+
+struct NEConcatenateLayer::Impl
+{
+    std::vector<const ITensor *>                   srcs{};
+    ITensor                                       *dst{ nullptr };
+    unsigned int                                   num_inputs{ 0 };
+    unsigned int                                   axis{ 0 };
+    std::unique_ptr<experimental::NEConcatenation> op{ nullptr };
+};
+
+NEConcatenateLayer::NEConcatenateLayer()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+
+NEConcatenateLayer::NEConcatenateLayer(NEConcatenateLayer &&) = default;
+
+NEConcatenateLayer &NEConcatenateLayer::operator=(NEConcatenateLayer &&) = default;
+
+NEConcatenateLayer::~NEConcatenateLayer() = default;
+
+void NEConcatenateLayer::configure(std::vector<const ITensor *> inputs_vector, ITensor *output, size_t axis)
+{
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    _impl->srcs       = inputs_vector;
+    _impl->dst        = output;
+    _impl->axis       = axis;
+    _impl->num_inputs = inputs_vector.size();
+    _impl->op         = arm_compute::support::cpp14::make_unique<experimental::NEConcatenation>();
+
+    std::vector<const ITensorInfo *> inputs_vector_info;
+    for(unsigned int i = 0; i < inputs_vector.size(); ++i)
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i));
+        inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
+    }
+    _impl->op->configure(inputs_vector_info, _impl->dst->info(), axis);
+}
+
+Status NEConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
+{
+    return experimental::NEConcatenation::validate(inputs_vector, output, axis);
+}
+
 void NEConcatenateLayer::run()
 {
-    for(auto &kernel : _concat_kernels)
+    ITensorPack pack;
+    for(unsigned i = 0; i < _impl->num_inputs; ++i)
     {
-        NEScheduler::get().schedule(kernel.get(), Window::DimY);
+        pack.add_tensor(TensorType::ACL_SRC_VEC + i, _impl->srcs.at(i));
     }
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+
+    _impl->op->run(pack);
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
index f65c035..f697efb 100644
--- a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
+++ b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEConvolution.cpp b/src/runtime/NEON/functions/NEConvolution.cpp
index 255cb3d..8200a08 100644
--- a/src/runtime/NEON/functions/NEConvolution.cpp
+++ b/src/runtime/NEON/functions/NEConvolution.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 4a77991..491425c 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -48,7 +48,7 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_UNUSED(num_groups);
     ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
-                                                            enable_fast_math));
+                                                            enable_fast_math, num_groups));
 
     switch(NEConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math))
     {
@@ -181,6 +181,39 @@
         {
             return ConvolutionMethod::GEMM;
         }
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        // This heuristics only applies to F16 data type on A55r1
+        if(NEScheduler::get().cpu_info().get_cpu_model() == CPUModel::A55r1 && enable_fast_math && input->data_type() == DataType::F16)
+        {
+            // Exclude known bad winograd configs (and defaults to GEMM)
+            const std::vector<ConvolutionConfiguration> known_bad_winograd_f16_with_fastmath_configs =
+            {
+                // Squeezenet_V1_1 fire2 and fire3
+                ConvolutionConfiguration(Size2D(56U, 56U), Size2D(3U, 3U), Size2D(16U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)),
+                // Squeezenet_V1_1 fire6 and fire7
+                ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(48U, 192U), PadStrideInfo(1U, 1U, 1U, 1U)),
+                // Squeezenet_V1_1 fire8 and fire9
+                ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(64U, 256U), PadStrideInfo(1U, 1U, 1U, 1U)),
+            };
+            const auto find_conv_config = [&](ConvolutionConfiguration c)
+            {
+                const PadStrideInfo info = std::get<3>(c);
+
+                return std::get<0>(c) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(c) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
+                       && std::get<2>(c) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right()
+                       && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride();
+            };
+
+            bool found_bad = std::find_if(known_bad_winograd_f16_with_fastmath_configs.begin(), known_bad_winograd_f16_with_fastmath_configs.end(),
+                                          find_conv_config)
+                             != known_bad_winograd_f16_with_fastmath_configs.end();
+            if(found_bad)
+            {
+                return ConvolutionMethod::GEMM;
+            }
+        }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         return bool(NEWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
     }
 }
diff --git a/src/runtime/NEON/functions/NECopy.cpp b/src/runtime/NEON/functions/NECopy.cpp
index 55c4faf..a461c18 100644
--- a/src/runtime/NEON/functions/NECopy.cpp
+++ b/src/runtime/NEON/functions/NECopy.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NECropResize.cpp b/src/runtime/NEON/functions/NECropResize.cpp
index cc39d02..f6ed2ec 100644
--- a/src/runtime/NEON/functions/NECropResize.cpp
+++ b/src/runtime/NEON/functions/NECropResize.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -111,7 +111,7 @@
         NEScheduler::get().schedule(_crop[i].get(), Window::DimZ);
 
         // Scale the cropped image.
-        _scale[i]->configure(_crop_results[i].get(), _scaled_results[i].get(), _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT, false);
+        _scale[i]->configure(_crop_results[i].get(), _scaled_results[i].get(), ScaleKernelInfo{ _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT, false });
         _scaled_results[i]->allocator()->allocate();
         _scale[i]->run();
 
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index dd53fbb..dff3070 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
index a2f890e..1ffcca0 100644
--- a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp
index 3569eec..e363f89 100644
--- a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 7214971..cfdf203 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEDequantizationLayer.cpp b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
index 42a0ee0..a4a3a43 100644
--- a/src/runtime/NEON/functions/NEDequantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEDerivative.cpp b/src/runtime/NEON/functions/NEDerivative.cpp
index 8118030..2499140 100644
--- a/src/runtime/NEON/functions/NEDerivative.cpp
+++ b/src/runtime/NEON/functions/NEDerivative.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp b/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp
index 0d97689..9e63800 100644
--- a/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp
+++ b/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEDilate.cpp b/src/runtime/NEON/functions/NEDilate.cpp
index 449147d..7f50386 100644
--- a/src/runtime/NEON/functions/NEDilate.cpp
+++ b/src/runtime/NEON/functions/NEDilate.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index 751a3fa..da7e771 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEElementwiseOperators.cpp b/src/runtime/NEON/functions/NEElementwiseOperators.cpp
index 926ae1f..d1f60c7 100644
--- a/src/runtime/NEON/functions/NEElementwiseOperators.cpp
+++ b/src/runtime/NEON/functions/NEElementwiseOperators.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,78 +32,70 @@
 
 namespace arm_compute
 {
-void NEElementwiseMax::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+namespace experimental
 {
-    ARM_COMPUTE_UNUSED(act_info);
+void NEElementwiseMax::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
+{
     auto k = arm_compute::support::cpp14::make_unique<NEArithmeticOperationKernel>();
     k->configure(ArithmeticOperation::MAX, input1, input2, output);
     _kernel = std::move(k);
 }
 
-Status NEElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status NEElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
     return NEArithmeticOperationKernel::validate(ArithmeticOperation::MAX, input1, input2, output);
 }
 
-void NEElementwiseMin::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+void NEElementwiseMin::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
 {
-    ARM_COMPUTE_UNUSED(act_info);
     auto k = arm_compute::support::cpp14::make_unique<NEArithmeticOperationKernel>();
     k->configure(ArithmeticOperation::MIN, input1, input2, output);
     _kernel = std::move(k);
 }
 
-Status NEElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status NEElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
     return NEArithmeticOperationKernel::validate(ArithmeticOperation::MIN, input1, input2, output);
 }
 
-void NEElementwiseSquaredDiff::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+void NEElementwiseSquaredDiff::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
 {
-    ARM_COMPUTE_UNUSED(act_info);
     auto k = arm_compute::support::cpp14::make_unique<NEArithmeticOperationKernel>();
     k->configure(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
     _kernel = std::move(k);
 }
 
-Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
     return NEArithmeticOperationKernel::validate(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
 }
 
-void NEElementwiseDivision::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+void NEElementwiseDivision::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
 {
-    ARM_COMPUTE_UNUSED(act_info);
     auto k = arm_compute::support::cpp14::make_unique<NEDivisionOperationKernel>();
     k->configure(input1, input2, output);
     _kernel = std::move(k);
 }
 
-Status NEElementwiseDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status NEElementwiseDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
     return NEDivisionOperationKernel::validate(input1, input2, output);
 }
 
-void NEElementwisePower::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+void NEElementwisePower::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
 {
-    ARM_COMPUTE_UNUSED(act_info);
     auto k = arm_compute::support::cpp14::make_unique<NEPowerOperationKernel>();
     k->configure(input1, input2, output);
     _kernel = std::move(k);
 }
 
-Status NEElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status NEElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
     return NEPowerOperationKernel::validate(input1, input2, output);
 }
 
 template <ComparisonOperation COP>
-void NEElementwiseComparisonStatic<COP>::configure(ITensor *input1, ITensor *input2, ITensor *output)
+void NEElementwiseComparisonStatic<COP>::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEComparisonOperationKernel>();
     k->configure(COP, input1, input2, output);
@@ -116,7 +108,7 @@
     return NEComparisonOperationKernel::validate(COP, input1, input2, output);
 }
 
-void NEElementwiseComparison::configure(ITensor *input1, ITensor *input2, ITensor *output, ComparisonOperation op)
+void NEElementwiseComparison::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ComparisonOperation op)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEComparisonOperationKernel>();
     k->configure(op, input1, input2, output);
@@ -135,4 +127,304 @@
 template class NEElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>;
 template class NEElementwiseComparisonStatic<ComparisonOperation::Less>;
 template class NEElementwiseComparisonStatic<ComparisonOperation::LessEqual>;
+} // namespace experimental
+
+struct NEElementwiseMax::Impl
+{
+    const ITensor                                  *src_0{ nullptr };
+    const ITensor                                  *src_1{ nullptr };
+    ITensor                                        *dst{ nullptr };
+    std::unique_ptr<experimental::NEElementwiseMax> op{ nullptr };
+};
+
+NEElementwiseMax::NEElementwiseMax()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+NEElementwiseMax::NEElementwiseMax(NEElementwiseMax &&) = default;
+NEElementwiseMax &NEElementwiseMax::operator=(NEElementwiseMax &&) = default;
+NEElementwiseMax::~NEElementwiseMax()                              = default;
+
+void NEElementwiseMax::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_UNUSED(act_info);
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::NEElementwiseMax>();
+    _impl->op->configure(input1->info(), input2->info(), output->info());
+}
+
+Status NEElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+    return experimental::NEElementwiseMax::validate(input1, input2, output);
+}
+
+void NEElementwiseMax::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct NEElementwiseMin::Impl
+{
+    const ITensor                                  *src_0{ nullptr };
+    const ITensor                                  *src_1{ nullptr };
+    ITensor                                        *dst{ nullptr };
+    std::unique_ptr<experimental::NEElementwiseMin> op{ nullptr };
+};
+
+NEElementwiseMin::NEElementwiseMin()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+NEElementwiseMin::NEElementwiseMin(NEElementwiseMin &&) = default;
+NEElementwiseMin &NEElementwiseMin::operator=(NEElementwiseMin &&) = default;
+NEElementwiseMin::~NEElementwiseMin()                              = default;
+
+void NEElementwiseMin::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_UNUSED(act_info);
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::NEElementwiseMin>();
+    _impl->op->configure(input1->info(), input2->info(), output->info());
+}
+
+Status NEElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+    return experimental::NEElementwiseMin::validate(input1, input2, output);
+}
+
+void NEElementwiseMin::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct NEElementwiseSquaredDiff::Impl
+{
+    const ITensor                                          *src_0{ nullptr };
+    const ITensor                                          *src_1{ nullptr };
+    ITensor                                                *dst{ nullptr };
+    std::unique_ptr<experimental::NEElementwiseSquaredDiff> op{ nullptr };
+};
+
+NEElementwiseSquaredDiff::NEElementwiseSquaredDiff()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+NEElementwiseSquaredDiff::NEElementwiseSquaredDiff(NEElementwiseSquaredDiff &&) = default;
+NEElementwiseSquaredDiff &NEElementwiseSquaredDiff::operator=(NEElementwiseSquaredDiff &&) = default;
+NEElementwiseSquaredDiff::~NEElementwiseSquaredDiff()                                      = default;
+
+void NEElementwiseSquaredDiff::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_UNUSED(act_info);
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::NEElementwiseSquaredDiff>();
+    _impl->op->configure(input1->info(), input2->info(), output->info());
+}
+
+Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+    return experimental::NEElementwiseSquaredDiff::validate(input1, input2, output);
+}
+
+void NEElementwiseSquaredDiff::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct NEElementwiseDivision::Impl
+{
+    const ITensor                                       *src_0{ nullptr };
+    const ITensor                                       *src_1{ nullptr };
+    ITensor                                             *dst{ nullptr };
+    std::unique_ptr<experimental::NEElementwiseDivision> op{ nullptr };
+};
+
+NEElementwiseDivision::NEElementwiseDivision()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+NEElementwiseDivision::NEElementwiseDivision(NEElementwiseDivision &&) = default;
+NEElementwiseDivision &NEElementwiseDivision::operator=(NEElementwiseDivision &&) = default;
+NEElementwiseDivision::~NEElementwiseDivision()                                   = default;
+
+void NEElementwiseDivision::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_UNUSED(act_info);
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::NEElementwiseDivision>();
+    _impl->op->configure(input1->info(), input2->info(), output->info());
+}
+
+Status NEElementwiseDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+    return experimental::NEElementwiseDivision::validate(input1, input2, output);
+}
+
+void NEElementwiseDivision::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct NEElementwisePower::Impl
+{
+    const ITensor                                    *src_0{ nullptr };
+    const ITensor                                    *src_1{ nullptr };
+    ITensor                                          *dst{ nullptr };
+    std::unique_ptr<experimental::NEElementwisePower> op{ nullptr };
+};
+
+NEElementwisePower::NEElementwisePower()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+NEElementwisePower::NEElementwisePower(NEElementwisePower &&) = default;
+NEElementwisePower &NEElementwisePower::operator=(NEElementwisePower &&) = default;
+NEElementwisePower::~NEElementwisePower()                                = default;
+
+void NEElementwisePower::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_UNUSED(act_info);
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::NEElementwisePower>();
+    _impl->op->configure(input1->info(), input2->info(), output->info());
+}
+
+Status NEElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+    return experimental::NEElementwisePower::validate(input1, input2, output);
+}
+
+void NEElementwisePower::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+template <ComparisonOperation COP>
+struct NEElementwiseComparisonStatic<COP>::Impl
+{
+    const ITensor                                                    *src_0{ nullptr };
+    const ITensor                                                    *src_1{ nullptr };
+    ITensor                                                          *dst{ nullptr };
+    std::unique_ptr<experimental::NEElementwiseComparisonStatic<COP>> op{ nullptr };
+};
+
+template <ComparisonOperation COP>
+NEElementwiseComparisonStatic<COP>::NEElementwiseComparisonStatic()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+template <ComparisonOperation COP>
+NEElementwiseComparisonStatic<COP>::NEElementwiseComparisonStatic(NEElementwiseComparisonStatic &&) = default;
+template <ComparisonOperation       COP>
+NEElementwiseComparisonStatic<COP> &NEElementwiseComparisonStatic<COP>::operator=(NEElementwiseComparisonStatic &&) = default;
+template <ComparisonOperation       COP>
+NEElementwiseComparisonStatic<COP>::~NEElementwiseComparisonStatic() = default;
+
+template <ComparisonOperation COP>
+void NEElementwiseComparisonStatic<COP>::configure(ITensor *input1, ITensor *input2, ITensor *output)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::NEElementwiseComparisonStatic<COP>>();
+    _impl->op->configure(input1->info(), input2->info(), output->info());
+}
+
+template <ComparisonOperation COP>
+Status NEElementwiseComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return experimental::NEElementwiseComparisonStatic<COP>::validate(input1, input2, output);
+}
+
+template <ComparisonOperation COP>
+void                          NEElementwiseComparisonStatic<COP>::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct NEElementwiseComparison::Impl
+{
+    const ITensor                                         *src_0{ nullptr };
+    const ITensor                                         *src_1{ nullptr };
+    ITensor                                               *dst{ nullptr };
+    std::unique_ptr<experimental::NEElementwiseComparison> op{ nullptr };
+};
+
+NEElementwiseComparison::NEElementwiseComparison()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+NEElementwiseComparison::NEElementwiseComparison(NEElementwiseComparison &&) = default;
+NEElementwiseComparison &NEElementwiseComparison::operator=(NEElementwiseComparison &&) = default;
+NEElementwiseComparison::~NEElementwiseComparison()                                     = default;
+
+void NEElementwiseComparison::configure(ITensor *input1, ITensor *input2, ITensor *output, ComparisonOperation op)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::NEElementwiseComparison>();
+    _impl->op->configure(input1->info(), input2->info(), output->info(), op);
+}
+
+Status NEElementwiseComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op)
+{
+    return experimental::NEElementwiseComparison::validate(input1, input2, output, op);
+}
+
+void NEElementwiseComparison::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+// Supported Specializations
+template class NEElementwiseComparisonStatic<ComparisonOperation::Equal>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::NotEqual>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::Greater>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::Less>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::LessEqual>;
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
index 80db027..cb4e3a0 100644
--- a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
+++ b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEEqualizeHistogram.cpp b/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
index 70b93ca..b3d5ad4 100644
--- a/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
+++ b/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEErode.cpp b/src/runtime/NEON/functions/NEErode.cpp
index 4f773b7..a89993c 100644
--- a/src/runtime/NEON/functions/NEErode.cpp
+++ b/src/runtime/NEON/functions/NEErode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEFFT1D.cpp b/src/runtime/NEON/functions/NEFFT1D.cpp
index 25ba1c8..744a915 100644
--- a/src/runtime/NEON/functions/NEFFT1D.cpp
+++ b/src/runtime/NEON/functions/NEFFT1D.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEFFT2D.cpp b/src/runtime/NEON/functions/NEFFT2D.cpp
index 2fea017..b63afe5 100644
--- a/src/runtime/NEON/functions/NEFFT2D.cpp
+++ b/src/runtime/NEON/functions/NEFFT2D.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
index 0823007..cd68788 100644
--- a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEFastCorners.cpp b/src/runtime/NEON/functions/NEFastCorners.cpp
index af35301..303c593 100644
--- a/src/runtime/NEON/functions/NEFastCorners.cpp
+++ b/src/runtime/NEON/functions/NEFastCorners.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEFill.cpp b/src/runtime/NEON/functions/NEFill.cpp
index d507f7c..79fe175 100644
--- a/src/runtime/NEON/functions/NEFill.cpp
+++ b/src/runtime/NEON/functions/NEFill.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEFillBorder.cpp b/src/runtime/NEON/functions/NEFillBorder.cpp
index 6b7a0fa..de2ef26 100644
--- a/src/runtime/NEON/functions/NEFillBorder.cpp
+++ b/src/runtime/NEON/functions/NEFillBorder.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEFlattenLayer.cpp b/src/runtime/NEON/functions/NEFlattenLayer.cpp
index a28411c..936a70d 100644
--- a/src/runtime/NEON/functions/NEFlattenLayer.cpp
+++ b/src/runtime/NEON/functions/NEFlattenLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEFloor.cpp b/src/runtime/NEON/functions/NEFloor.cpp
index 98b9725..95b2497 100644
--- a/src/runtime/NEON/functions/NEFloor.cpp
+++ b/src/runtime/NEON/functions/NEFloor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index e275bca..4dcf41e 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp
index 68dc159..fd26bb4 100644
--- a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp
+++ b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 2bd459a..3b8ca44 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,7 +43,7 @@
 {
 NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
     : _memory_group(memory_manager), _weights_manager(weights_manager), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(memory_manager, weights_manager), _ma_kernel(),
-      _alpha_scale_func(nullptr), _add_bias_kernel(), _activation_func(), _tmp_a(), _tmp_b(), _tmp_d(), _original_b(nullptr), _run_vector_matrix_multiplication(false), _run_alpha_scale(false),
+      _alpha_scale_func(nullptr), _add_bias(), _activation_func(), _tmp_a(), _tmp_b(), _tmp_d(), _original_b(nullptr), _run_vector_matrix_multiplication(false), _run_alpha_scale(false),
       _run_addition(false), _run_bias_addition(false), _run_activation(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
 {
 }
@@ -68,16 +68,7 @@
     if(run_optimised)
     {
         const ITensor *c_to_use = is_c_bias ? c : nullptr;
-        if(MEMInfo::get_policy() == MemoryPolicy::MINIMIZE)
-        {
-            GEMMInfo gemm_info_ntb = gemm_info;
-            gemm_info_ntb.set_pretranpose_B(false);
-            _asm_glue.configure(a, b, c_to_use, d, gemm_info_ntb);
-        }
-        else
-        {
-            _asm_glue.configure(a, b, c_to_use, d, gemm_info);
-        }
+        _asm_glue.configure(a, b, c_to_use, d, gemm_info);
         ARM_COMPUTE_ERROR_ON(!_asm_glue.is_configured());
 
         // Scale product by alpha
@@ -150,7 +141,7 @@
 
         if(_run_bias_addition)
         {
-            _add_bias_kernel.configure(gemm_output_to_use, c, d, ConvertPolicy::SATURATE);
+            _add_bias.configure(gemm_output_to_use, c, d, ConvertPolicy::SATURATE);
             _tmp_d.allocator()->allocate();
         }
     }
@@ -267,7 +258,7 @@
 
         if(c != nullptr && gemm_info.reshape_b_only_on_first_run())
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&tmp_output_info, c, output, ConvertPolicy::SATURATE));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&tmp_output_info, c, output, ConvertPolicy::SATURATE));
         }
     }
 
@@ -320,7 +311,7 @@
         // Run bias addition kernel
         if(_run_bias_addition)
         {
-            NEScheduler::get().schedule(&_add_bias_kernel, Window::DimY);
+            _add_bias.run();
         }
     }
 
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
index 24bd7d7..3b9dde2 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,10 +23,14 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
 
+#include "src/core/NEON/kernels/assembly/arm_gemm.hpp"
+
 #include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h"
 
+#include "src/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h"
+
 #include <arm_neon.h>
 
 namespace arm_compute
@@ -280,7 +284,7 @@
     //if we disable this code below in brackets then ConvLayer deadlocks when threads > 1 and
     //the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001
     {
-        const unsigned int window_size = get_total_window_size(*_gemm_kernel_asm);
+        const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
         if(window_size < static_cast<unsigned int>(args._maxthreads))
         {
             _gemm_kernel_asm->set_nthreads(window_size);
@@ -404,7 +408,7 @@
     if(_workspace.buffer() != nullptr)
     {
         _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(_workspace.buffer()));
-        const unsigned int window_size = get_total_window_size(*_gemm_kernel_asm);
+        const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
         unsigned int       num_threads = NEScheduler::get().num_threads();
         if(window_size < num_threads)
         {
@@ -433,14 +437,20 @@
     {
         const int granule_threshold = 200;
         scheduling_hint             = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold);
-
     }
-    else if(_kernel_info.method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && _d->info()->data_type() == DataType::F32)
+    else if(_kernel_info.method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && (_d->info()->data_type() == DataType::F32 || _d->info()->data_type() == DataType::F16
+                                                                                 || _d->info()->data_type() == DataType::U8 || _d->info()->data_type() == DataType::S8))
     {
         //GEMM_INTERLEAVED supports 2D parallelism, IScheduler::split_dimensions_all signals to parallelise over all window dimensions
         const int granule_threshold = 200;
         scheduling_hint             = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
     }
+    else if(_kernel_info.method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D && (_d->info()->data_type() == DataType::QASYMM8 || _d->info()->data_type() == DataType::QASYMM8_SIGNED))
+    {
+        //special case for QASYMM8 to support 2D parallelism, scheduler here may be tweaked differently compared to FP32 case
+        const int granule_threshold = 200;
+        scheduling_hint             = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
+    }
 
     NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint);
 }
@@ -454,7 +464,7 @@
     const CPUInfo               &ci          = NEScheduler::get().cpu_info();
     unsigned int                 num_threads = NEScheduler::get().num_threads();
 
-    arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, activation, num_threads, gemm_info.pretranpose_B());
+    arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, activation, num_threads);
 
     // Create arm_gemm fallback
     auto fallback = support::cpp14::make_unique<Fallback<TypeInput, TypeOutput>>();
@@ -467,11 +477,12 @@
                            const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const GEMMInfo &gemm_info,
                            IWeightsManager *weights_manager)
 {
+    ARM_COMPUTE_UNUSED(activation);
     INEGEMMWrapperKernel::Params p           = INEGEMMWrapperKernel::extract_parameters(a, b, d, gemm_info);
     const CPUInfo               &ci          = NEScheduler::get().cpu_info();
     unsigned int                 num_threads = NEScheduler::get().num_threads();
 
-    arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, activation, num_threads, gemm_info.pretranpose_B());
+    arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, activation, num_threads);
 
     // Create arm_gemm fallback
     auto fallback = support::cpp14::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>();
@@ -512,10 +523,12 @@
 
 Status NEGEMMAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const GEMMInfo &gemm_info)
 {
-    ARM_COMPUTE_UNUSED(gemm_info, c);
+    ARM_COMPUTE_UNUSED(c);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, d);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(!gemm_info.pretranpose_B());
 #ifndef __aarch64__
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->element_size() == 1, "8bit integer types only supported for aarch64");
 #endif /* __aarch64__ */
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index a41d23f..834a66a 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
index 5692beb..ad306c3 100644
--- a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
+++ b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
index 087df19..6d52f2b 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index f0fa616..dada6d1 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -117,8 +117,18 @@
         {
             if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
             {
-                _asm_glue.configure(a_to_use, b, c, output, gemm_info);
-                _fused_assembly_path = _asm_glue.is_configured();
+                // Result shifts < 0 are not supported by asm kernels
+                const std::vector<int32_t> &shifts           = info.gemmlowp_output_stage().gemmlowp_shifts;
+                const bool                  is_asm_supported = info.gemmlowp_output_stage().gemmlowp_shift >= 0
+                                                               && std::all_of(shifts.cbegin(), shifts.cend(), [](int32_t val)
+                {
+                    return val >= 0;
+                });
+                if(is_asm_supported)
+                {
+                    _asm_glue.configure(a_to_use, b, c, output, gemm_info);
+                    _fused_assembly_path = _asm_glue.is_configured();
+                }
             }
             else
             {
@@ -327,22 +337,20 @@
     // Check if we need to run the optimized assembly kernel
     bool run_optimised             = false;
     bool run_optimised_requantized = false;
-    if(a_to_use->data_type() == DataType::QASYMM8 && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+    if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
     {
-        run_optimised             = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, gemm_info));
-        run_optimised_requantized = run_optimised;
-
-        const UniformQuantizationInfo a_qinfo      = a_to_use->quantization_info().uniform();
-        const QuantizationInfo        b_qinfo      = b->quantization_info();
-        const UniformQuantizationInfo output_qinfo = output->quantization_info().uniform();
-        for(auto const s : b_qinfo.scale())
+        // Result shifts < 0 are not supported by asm kernels
+        const std::vector<int32_t> &shifts           = info.gemmlowp_output_stage().gemmlowp_shifts;
+        const bool                  is_asm_supported = info.gemmlowp_output_stage().gemmlowp_shift >= 0
+                                                       && std::all_of(shifts.cbegin(), shifts.cend(), [](int32_t val)
         {
-            const float fmultipler = a_qinfo.scale * s / output_qinfo.scale;
-            if(fmultipler > 1.f)
-            {
-                run_optimised_requantized = false;
-                break;
-            }
+            return val >= 0;
+        });
+
+        if(is_asm_supported)
+        {
+            run_optimised             = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, gemm_info));
+            run_optimised_requantized = run_optimised;
         }
     }
     else
@@ -429,6 +437,9 @@
         {
             if(!run_optimised)
             {
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMMLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMMLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
+
                 ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info));
             }
 
@@ -445,6 +456,9 @@
         {
             if(!run_optimised)
             {
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMMLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMMLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
+
                 ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
             }
             // Validate offset contribution kernel
@@ -574,7 +588,7 @@
         }
 
         // Run matrix B reduction kernel only if _a_offset is not equal to 0
-        if(_a_offset != 0 && _reshape_b_only_on_first_run)
+        if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run)
         {
             _vector_sum_col.allocator()->allocate();
             NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
index 43ca7b3..239a8e6 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,29 +33,6 @@
 
 namespace arm_compute
 {
-void NEGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max)
-{
-    GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo();
-    info.gemmlowp_offset         = result_offset;
-    info.gemmlowp_multiplier     = result_mult_int;
-    info.gemmlowp_shift          = result_shift;
-    info.gemmlowp_min_bound      = min;
-    info.gemmlowp_max_bound      = max;
-
-    auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ScaleKernel>();
-    k->configure(input, bias, output, &info);
-    _kernel = std::move(k);
-}
-
-Status NEGEMMLowpQuantizeDownInt32ToUint8Scale::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
-{
-    GEMMLowpOutputStageInfo info = GEMMLowpOutputStageInfo();
-    info.gemmlowp_min_bound      = min;
-    info.gemmlowp_max_bound      = max;
-
-    return NEGEMMLowpQuantizeDownInt32ScaleKernel::validate(input, bias, output, &info);
-}
-
 void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift,
                                                                     int result_offset_after_shift, int min, int max)
 {
diff --git a/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
index c7a50a8..e807e86 100644
--- a/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
+++ b/src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEGather.cpp b/src/runtime/NEON/functions/NEGather.cpp
index cad42a3..5238936 100644
--- a/src/runtime/NEON/functions/NEGather.cpp
+++ b/src/runtime/NEON/functions/NEGather.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEGaussian3x3.cpp b/src/runtime/NEON/functions/NEGaussian3x3.cpp
index 399d19d..fba49ed 100644
--- a/src/runtime/NEON/functions/NEGaussian3x3.cpp
+++ b/src/runtime/NEON/functions/NEGaussian3x3.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEGaussian5x5.cpp b/src/runtime/NEON/functions/NEGaussian5x5.cpp
index 3c7411e..99591f4 100644
--- a/src/runtime/NEON/functions/NEGaussian5x5.cpp
+++ b/src/runtime/NEON/functions/NEGaussian5x5.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEGaussianPyramid.cpp b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
index d08bf1e..ae883bc 100644
--- a/src/runtime/NEON/functions/NEGaussianPyramid.cpp
+++ b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -168,7 +168,7 @@
             _gaus5x5[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
 
             /* Configure scale */
-            _scale_nearest[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED);
+            _scale_nearest[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), ScaleKernelInfo{ InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED });
         }
 
         _tmp.allocate();
diff --git a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
index 82880ba..3d53778 100644
--- a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
+++ b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,9 +31,9 @@
 NEGenerateProposalsLayer::NEGenerateProposalsLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager),
       _permute_deltas_kernel(),
-      _flatten_deltas_kernel(),
+      _flatten_deltas(),
       _permute_scores_kernel(),
-      _flatten_scores_kernel(),
+      _flatten_scores(),
       _compute_anchors_kernel(),
       _bounding_box_kernel(),
       _pad_kernel(),
@@ -95,12 +95,12 @@
     {
         _memory_group.manage(&_deltas_permuted);
         _permute_deltas_kernel.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
-        _flatten_deltas_kernel.configure(&_deltas_permuted, &_deltas_flattened);
+        _flatten_deltas.configure(&_deltas_permuted, &_deltas_flattened);
         _deltas_permuted.allocator()->allocate();
     }
     else
     {
-        _flatten_deltas_kernel.configure(deltas, &_deltas_flattened);
+        _flatten_deltas.configure(deltas, &_deltas_flattened);
     }
 
     const TensorShape flatten_shape_scores(1, total_num_anchors);
@@ -112,12 +112,12 @@
     {
         _memory_group.manage(&_scores_permuted);
         _permute_scores_kernel.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
-        _flatten_scores_kernel.configure(&_scores_permuted, &_scores_flattened);
+        _flatten_scores.configure(&_scores_permuted, &_scores_flattened);
         _scores_permuted.allocator()->allocate();
     }
     else
     {
-        _flatten_scores_kernel.configure(scores, &_scores_flattened);
+        _flatten_scores.configure(scores, &_scores_flattened);
     }
 
     Tensor *anchors_to_use = &_all_anchors;
@@ -244,12 +244,12 @@
     }
 
     TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(&deltas_permuted_info, &deltas_flattened_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(&deltas_permuted_info, &deltas_flattened_info));
 
     TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
     TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
 
-    ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(&scores_permuted_info, &scores_flattened_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(&scores_permuted_info, &scores_flattened_info));
 
     TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values;
     TensorInfo  proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
@@ -327,8 +327,8 @@
         NEScheduler::get().schedule(&_permute_scores_kernel, Window::DimY);
     }
 
-    NEScheduler::get().schedule(&_flatten_deltas_kernel, Window::DimY);
-    NEScheduler::get().schedule(&_flatten_scores_kernel, Window::DimY);
+    _flatten_deltas.run();
+    _flatten_scores.run();
 
     if(_is_qasymm8)
     {
diff --git a/src/runtime/NEON/functions/NEHOGDescriptor.cpp b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
index 8efc091..10765f9 100644
--- a/src/runtime/NEON/functions/NEHOGDescriptor.cpp
+++ b/src/runtime/NEON/functions/NEHOGDescriptor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEHOGDetector.cpp b/src/runtime/NEON/functions/NEHOGDetector.cpp
index 95d7aae..21db5f8 100644
--- a/src/runtime/NEON/functions/NEHOGDetector.cpp
+++ b/src/runtime/NEON/functions/NEHOGDetector.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEHOGGradient.cpp b/src/runtime/NEON/functions/NEHOGGradient.cpp
index 2760632..8f3559a 100644
--- a/src/runtime/NEON/functions/NEHOGGradient.cpp
+++ b/src/runtime/NEON/functions/NEHOGGradient.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
index 572e427..e08b699 100644
--- a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
+++ b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEHarrisCorners.cpp b/src/runtime/NEON/functions/NEHarrisCorners.cpp
index bf1e271..3c51eb2 100644
--- a/src/runtime/NEON/functions/NEHarrisCorners.cpp
+++ b/src/runtime/NEON/functions/NEHarrisCorners.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEHistogram.cpp b/src/runtime/NEON/functions/NEHistogram.cpp
index 6a672ed..39fad97 100644
--- a/src/runtime/NEON/functions/NEHistogram.cpp
+++ b/src/runtime/NEON/functions/NEHistogram.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEIm2Col.cpp b/src/runtime/NEON/functions/NEIm2Col.cpp
index 3cb0dc1..99e5d3f 100644
--- a/src/runtime/NEON/functions/NEIm2Col.cpp
+++ b/src/runtime/NEON/functions/NEIm2Col.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
index d7cb7de..57d01ff 100644
--- a/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h"
 
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
 namespace arm_compute
@@ -35,7 +36,8 @@
 
 void NEInstanceNormalizationLayer::configure(ITensor *input, ITensor *output, float gamma, float beta, float epsilon)
 {
-    const DataLayout data_layout = input->info()->data_layout();
+    const DataLayout data_layout       = input->info()->data_layout();
+    const auto       kernel_descriptor = InstanceNormalizationLayerKernelInfo{ gamma, beta, epsilon, true };
 
     // Configure Kernels
     _is_nchw = data_layout == DataLayout::NCHW;
@@ -49,7 +51,7 @@
         _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
         _permuted_input.info()->set_data_layout(DataLayout::NCHW);
 
-        _normalization_kernel.configure(&_permuted_input, &_permuted_output, gamma, beta, epsilon);
+        _normalization_kernel.configure(&_permuted_input, &_permuted_output, kernel_descriptor);
         _permuted_output.info()->set_data_layout(DataLayout::NCHW);
 
         _permute_output.configure(&_permuted_output, output != nullptr ? output : input, PermutationVector(2U, 0U, 1U));
@@ -58,13 +60,15 @@
     }
     else
     {
-        _normalization_kernel.configure(input, output, gamma, beta, epsilon);
+        _normalization_kernel.configure(input, output, kernel_descriptor);
     }
 }
 
 Status NEInstanceNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon)
 {
-    return NEInstanceNormalizationLayerKernel::validate(&input->clone()->set_data_layout(DataLayout::NCHW), &output->clone()->set_data_layout(DataLayout::NCHW), gamma, beta, epsilon);
+    return NEInstanceNormalizationLayerKernel::validate(&input->clone()->set_data_layout(DataLayout::NCHW),
+                                                        &output->clone()->set_data_layout(DataLayout::NCHW),
+                                                        InstanceNormalizationLayerKernelInfo{ gamma, beta, epsilon, true });
 }
 
 void NEInstanceNormalizationLayer::run()
diff --git a/src/runtime/NEON/functions/NEIntegralImage.cpp b/src/runtime/NEON/functions/NEIntegralImage.cpp
index 845f3b0..8ab6bbd 100644
--- a/src/runtime/NEON/functions/NEIntegralImage.cpp
+++ b/src/runtime/NEON/functions/NEIntegralImage.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
index 88ffdbf..04cf3a2 100644
--- a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
+++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp
index f9d445f..dca274a 100644
--- a/src/runtime/NEON/functions/NELSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NELSTMLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -347,7 +347,7 @@
     _copy_output.configure(output_state_out, output);
 
     // Vector for holding the tensors to store in scratch buffer
-    std::vector<ITensor *> scratch_inputs;
+    std::vector<const ITensor *> scratch_inputs;
     if(!lstm_params.has_cifg_opt())
     {
         scratch_inputs.emplace_back(input_gate_out);
@@ -464,17 +464,17 @@
 
     if(lstm_params.has_peephole_opt())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
         ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
     }
     if(lstm_params.use_layer_norm())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&forget_gate));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE,
-                                                                              RoundingPolicy::TO_ZERO));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE,
+                                                                        RoundingPolicy::TO_ZERO));
         ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE));
     }
-    ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
 
     // Validate input gate
     if(!lstm_params.has_cifg_opt())
@@ -498,21 +498,21 @@
         {
             ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
             ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1);
-            ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
             ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
         }
 
         if(lstm_params.use_layer_norm())
         {
             ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&input_gate));
-            ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
             ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE));
         }
-        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtractionKernel::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
     }
 
     // Validate cell state
@@ -522,18 +522,18 @@
     if(lstm_params.use_layer_norm())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&cell_state_tmp));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE,
-                                                                              RoundingPolicy::TO_ZERO));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE,
+                                                                        RoundingPolicy::TO_ZERO));
         ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE));
     }
-    ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&cell_state_tmp, nullptr, activation_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, nullptr, activation_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
     ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
     if(cell_threshold != 0.f)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold,
-                                                                                                                    cell_threshold)));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -cell_threshold,
+                                                                                                              cell_threshold)));
     }
 
     // Validate output gate tmp
@@ -548,29 +548,29 @@
 
     if(lstm_params.has_peephole_opt())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
-                                                                              RoundingPolicy::TO_ZERO));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
+                                                                        RoundingPolicy::TO_ZERO));
         ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
     }
     if(lstm_params.use_layer_norm())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&output_gate_tmp));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
-                                                                              RoundingPolicy::TO_ZERO));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
+                                                                        RoundingPolicy::TO_ZERO));
         ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE));
     }
-    ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
 
     // Validate output state
-    ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&cell_state_tmp, &cell_state_tmp, activation_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, &cell_state_tmp, activation_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
     if(lstm_params.has_projection())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out));
         if(projection_threshold != 0.f)
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(output_state_out, output_state_out,
-                                                                          ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output_state_out, output_state_out,
+                                                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)));
         }
     }
 
@@ -579,7 +579,7 @@
     ARM_COMPUTE_RETURN_ON_ERROR(NECopyKernel::validate(output_state_out, output));
 
     // Validate scratch concatenation
-    std::vector<ITensorInfo *> inputs_vector_info_raw;
+    std::vector<const ITensorInfo *> inputs_vector_info_raw;
     if(!lstm_params.has_cifg_opt())
     {
         inputs_vector_info_raw.push_back(&input_gate);
@@ -603,16 +603,16 @@
 
     if(_run_peephole_opt)
     {
-        NEScheduler::get().schedule(&_pixelwise_mul_forget_gate, Window::DimY);
+        _pixelwise_mul_forget_gate.run();
         _accum_forget_gate1.run();
     }
     if(_is_layer_norm_lstm)
     {
         _mean_std_norm_forget_gate.run();
-        NEScheduler::get().schedule(&_pixelwise_mul_forget_gate_coeff, Window::DimY);
-        NEScheduler::get().schedule(&_accum_forget_gate_bias, Window::DimY);
+        _pixelwise_mul_forget_gate_coeff.run();
+        _accum_forget_gate_bias.run();
     }
-    NEScheduler::get().schedule(&_activation_forget_gate, Window::DimY);
+    _activation_forget_gate.run();
 
     if(_run_cifg_opt)
     {
@@ -624,7 +624,7 @@
         {
             std::fill_n(reinterpret_cast<float *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
         }
-        NEScheduler::get().schedule(&_subtract_input_gate, Window::DimY);
+        _subtract_input_gate.run();
     }
     else
     {
@@ -632,62 +632,62 @@
 
         if(_run_peephole_opt)
         {
-            NEScheduler::get().schedule(&_pixelwise_mul_input_gate, Window::DimY);
+            _pixelwise_mul_input_gate.run();
             _accum_input_gate1.run();
         }
 
         if(_is_layer_norm_lstm)
         {
             _mean_std_norm_input_gate.run();
-            NEScheduler::get().schedule(&_pixelwise_mul_input_gate_coeff, Window::DimY);
-            NEScheduler::get().schedule(&_accum_input_gate_bias, Window::DimY);
+            _pixelwise_mul_input_gate_coeff.run();
+            _accum_input_gate_bias.run();
         }
-        NEScheduler::get().schedule(&_activation_input_gate, Window::DimY);
+        _activation_input_gate.run();
     }
 
     _fully_connected_cell_state.run();
     NEScheduler::get().schedule(&_transpose_cell_state, Window::DimY);
     _gemm_cell_state1.run();
-    NEScheduler::get().schedule(&_accum_cell_state1, Window::DimY);
+    _accum_cell_state1.run();
     if(_is_layer_norm_lstm)
     {
         _mean_std_norm_cell_gate.run();
-        NEScheduler::get().schedule(&_pixelwise_mul_cell_gate_coeff, Window::DimY);
-        NEScheduler::get().schedule(&_accum_cell_gate_bias, Window::DimY);
+        _pixelwise_mul_cell_gate_coeff.run();
+        _accum_cell_gate_bias.run();
     }
-    NEScheduler::get().schedule(&_activation_cell_state, Window::DimY);
-    NEScheduler::get().schedule(&_pixelwise_mul_cell_state1, Window::DimY);
-    NEScheduler::get().schedule(&_pixelwise_mul_cell_state2, Window::DimY);
-    NEScheduler::get().schedule(&_accum_cell_state2, Window::DimY);
+    _activation_cell_state.run();
+    _pixelwise_mul_cell_state1.run();
+    _pixelwise_mul_cell_state2.run();
+    _accum_cell_state2.run();
 
     if(_perform_cell_clipping)
     {
-        NEScheduler::get().schedule(&_cell_clip, Window::DimY);
+        _cell_clip.run();
     }
 
     _fully_connected_output.run();
     if(_run_peephole_opt)
     {
-        NEScheduler::get().schedule(&_pixelwise_mul_output_state1, Window::DimY);
+        _pixelwise_mul_output_state1.run();
         _accum_output1.run();
     }
     if(_is_layer_norm_lstm)
     {
         _mean_std_norm_output_gate.run();
-        NEScheduler::get().schedule(&_pixelwise_mul_output_gate_coeff, Window::DimY);
-        NEScheduler::get().schedule(&_accum_output_gate_bias, Window::DimY);
+        _pixelwise_mul_output_gate_coeff.run();
+        _accum_output_gate_bias.run();
     }
-    NEScheduler::get().schedule(&_activation_output, Window::DimY);
+    _activation_output.run();
 
-    NEScheduler::get().schedule(&_activation_output_state, Window::DimY);
-    NEScheduler::get().schedule(&_pixelwise_mul_output_state2, Window::DimY);
+    _activation_output_state.run();
+    _pixelwise_mul_output_state2.run();
 
     if(_has_projection_weights)
     {
         _fully_connected_output_state.run();
         if(_perform_projection_clipping)
         {
-            NEScheduler::get().schedule(&_projection_clip, Window::DimY);
+            _projection_clip.run();
         }
     }
 
diff --git a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
index cdfc035..11989d3 100644
--- a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
+++ b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NELaplacianPyramid.cpp b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
index 6b37029..4f0639b 100644
--- a/src/runtime/NEON/functions/NELaplacianPyramid.cpp
+++ b/src/runtime/NEON/functions/NELaplacianPyramid.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
index 9f7588e..24755fc9 100644
--- a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
+++ b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -73,7 +73,7 @@
     // Scale levels n-1 to 1, and add levels n-2 to 0
     for(size_t l = 0; l < last_level; ++l)
     {
-        _scalef[l].configure(_tmp_pyr.get_pyramid_level(l + 1), _tmp_pyr.get_pyramid_level(l), arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, constant_border_value);
+        _scalef[l].configure(_tmp_pyr.get_pyramid_level(l + 1), _tmp_pyr.get_pyramid_level(l), ScaleKernelInfo{ arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, constant_border_value });
         _addf[l].configure(_tmp_pyr.get_pyramid_level(l), pyramid->get_pyramid_level(l), _tmp_pyr.get_pyramid_level(l), ConvertPolicy::SATURATE);
     }
 
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
index d08202d..af502be 100644
--- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEMagnitude.cpp b/src/runtime/NEON/functions/NEMagnitude.cpp
index ff2cd49..5ca672e 100644
--- a/src/runtime/NEON/functions/NEMagnitude.cpp
+++ b/src/runtime/NEON/functions/NEMagnitude.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp b/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp
new file mode 100644
index 0000000..9d3f34f
--- /dev/null
+++ b/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h"
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NEMaxUnpoolingLayer::NEMaxUnpoolingLayer()
+
+    : _memset_kernel(), _unpooling_layer_kernel()
+{
+}
+
+void NEMaxUnpoolingLayer::configure(ITensor *input, ITensor *indices, ITensor *output, const PoolingLayerInfo &pool_info)
+{
+    const PixelValue zero_value(0.f);
+    _memset_kernel.configure(output, zero_value);
+    _unpooling_layer_kernel.configure(input, indices, output, pool_info);
+}
+
+Status NEMaxUnpoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+{
+    return NEMaxUnpoolingLayerKernel::validate(input, indices, output, pool_info);
+}
+
+void NEMaxUnpoolingLayer::run()
+{
+    NEScheduler::get().schedule(&_memset_kernel, Window::DimY);
+    NEScheduler::get().schedule(&_unpooling_layer_kernel, Window::DimY);
+}
+} /* namespace arm_compute */
diff --git a/src/runtime/NEON/functions/NEMeanStdDev.cpp b/src/runtime/NEON/functions/NEMeanStdDev.cpp
index 2304bc8..57363f0 100644
--- a/src/runtime/NEON/functions/NEMeanStdDev.cpp
+++ b/src/runtime/NEON/functions/NEMeanStdDev.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp b/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp
index fdf2980..a88732b 100644
--- a/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEMedian3x3.cpp b/src/runtime/NEON/functions/NEMedian3x3.cpp
index e24023c..2bbe8d3 100644
--- a/src/runtime/NEON/functions/NEMedian3x3.cpp
+++ b/src/runtime/NEON/functions/NEMedian3x3.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEMinMaxLocation.cpp b/src/runtime/NEON/functions/NEMinMaxLocation.cpp
index 54e89ab..ca63937 100644
--- a/src/runtime/NEON/functions/NEMinMaxLocation.cpp
+++ b/src/runtime/NEON/functions/NEMinMaxLocation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016, 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NENonLinearFilter.cpp b/src/runtime/NEON/functions/NENonLinearFilter.cpp
index 6875d2e..b7c72ac 100644
--- a/src/runtime/NEON/functions/NENonLinearFilter.cpp
+++ b/src/runtime/NEON/functions/NENonLinearFilter.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp b/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
index 5c1c3d2..4d9edf7 100644
--- a/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
+++ b/src/runtime/NEON/functions/NENonMaximaSuppression3x3.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp
index d52e928..10ee938 100644
--- a/src/runtime/NEON/functions/NENormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,10 +30,10 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 NENormalizationLayer::NENormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _norm_kernel(), _multiply_kernel(), _border_handler(), _input_squared()
+    : _memory_group(std::move(memory_manager)), _norm_kernel(), _multiply_f(), _input_squared()
 {
 }
 
@@ -49,8 +49,7 @@
 
     // Configure kernels
     _norm_kernel.configure(input, &_input_squared, output, norm_info);
-    _multiply_kernel.configure(input, input, &_input_squared, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
-    _border_handler.configure(&_input_squared, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0.0f));
+    _multiply_f.configure(input, input, &_input_squared, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
 
     // Allocate the tensor once the configure methods have been called
     _input_squared.allocator()->allocate();
@@ -62,7 +61,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
 
     ARM_COMPUTE_RETURN_ON_ERROR(NENormalizationLayerKernel::validate(input, input, output, norm_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
 
     return Status{};
 }
@@ -70,8 +69,7 @@
 void NENormalizationLayer::run()
 {
     MemoryGroupResourceScope scope_mg(_memory_group);
-
-    NEScheduler::get().schedule(&_multiply_kernel, Window::DimY);
-    NEScheduler::get().schedule(&_border_handler, Window::DimY);
+    _multiply_f.run();
     NEScheduler::get().schedule(&_norm_kernel, Window::DimY);
 }
+}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEOpticalFlow.cpp b/src/runtime/NEON/functions/NEOpticalFlow.cpp
index cb10ca8..c9e0748 100644
--- a/src/runtime/NEON/functions/NEOpticalFlow.cpp
+++ b/src/runtime/NEON/functions/NEOpticalFlow.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEPReluLayer.cpp b/src/runtime/NEON/functions/NEPReluLayer.cpp
index 02dfc6f..f9393a4 100644
--- a/src/runtime/NEON/functions/NEPReluLayer.cpp
+++ b/src/runtime/NEON/functions/NEPReluLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,15 +29,57 @@
 
 namespace arm_compute
 {
-void NEPReluLayer::configure(const ITensor *input, const ITensor *alpha, ITensor *output)
+namespace experimental
+{
+void NEPRelu::configure(const ITensorInfo *input, const ITensorInfo *alpha, ITensorInfo *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEArithmeticOperationKernel>();
     k->configure(ArithmeticOperation::PRELU, input, alpha, output);
     _kernel = std::move(k);
 }
 
-Status NEPReluLayer::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
+Status NEPRelu::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
 {
     return NEArithmeticOperationKernel::validate(ArithmeticOperation::PRELU, input, alpha, output);
 }
+} // namespace experimental
+
+struct NEPReluLayer::Impl
+{
+    const ITensor                         *src_0{ nullptr };
+    const ITensor                         *src_1{ nullptr };
+    ITensor                               *dst{ nullptr };
+    std::unique_ptr<experimental::NEPRelu> op{ nullptr };
+};
+
+NEPReluLayer::NEPReluLayer()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+NEPReluLayer::NEPReluLayer(NEPReluLayer &&) = default;
+NEPReluLayer &NEPReluLayer::operator=(NEPReluLayer &&) = default;
+NEPReluLayer::~NEPReluLayer()                          = default;
+
+void NEPReluLayer::configure(const ITensor *input, const ITensor *alpha, ITensor *output)
+{
+    _impl->src_0 = input;
+    _impl->src_1 = alpha;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::NEPRelu>();
+    _impl->op->configure(input->info(), alpha->info(), output->info());
+}
+
+void NEPReluLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+Status NEPReluLayer::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
+{
+    return experimental::NEPRelu::validate(input, alpha, output);
+}
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEPadLayer.cpp b/src/runtime/NEON/functions/NEPadLayer.cpp
index 537eba7..21c349b 100644
--- a/src/runtime/NEON/functions/NEPadLayer.cpp
+++ b/src/runtime/NEON/functions/NEPadLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -117,7 +117,7 @@
             const int32_t end_mask_after    = ends_after[i] < 0 ? ~0 : ~(1u << i);
 
             // Reflect the input values for the padding before and after the input.
-            std::vector<ITensor *> concat_vector;
+            std::vector<const ITensor *> concat_vector;
             if(_padding[i].first > 0)
             {
                 if(i < prev->info()->num_dimensions())
diff --git a/src/runtime/NEON/functions/NEPermute.cpp b/src/runtime/NEON/functions/NEPermute.cpp
index cfd27da..698add8 100644
--- a/src/runtime/NEON/functions/NEPermute.cpp
+++ b/src/runtime/NEON/functions/NEPermute.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEPhase.cpp b/src/runtime/NEON/functions/NEPhase.cpp
index bb96f6d..8577961 100644
--- a/src/runtime/NEON/functions/NEPhase.cpp
+++ b/src/runtime/NEON/functions/NEPhase.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
index eaf233b..4208878 100644
--- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
+++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,23 +31,15 @@
 
 namespace arm_compute
 {
-void NEPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+namespace experimental
+{
+void NEPixelWiseMultiplication::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
                                           const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(act_info);
     auto k = arm_compute::support::cpp14::make_unique<NEPixelWiseMultiplicationKernel>();
     k->configure(input1, input2, output, scale, overflow_policy, rounding_policy);
     _kernel = std::move(k);
-
-    if(output->info()->dimension(0) > 1)
-    {
-        ITensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
-
-        if(broadcasted_info->info()->dimension(0) == 1)
-        {
-            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
-        }
-    }
 }
 Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
                                            const ActivationLayerInfo &act_info)
@@ -56,22 +48,12 @@
     return NEPixelWiseMultiplicationKernel::validate(input1, input2, output, scale, overflow_policy, rounding_policy);
 }
 
-void NEComplexPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+void NEComplexPixelWiseMultiplication::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(act_info);
     auto k = arm_compute::support::cpp14::make_unique<NEComplexPixelWiseMultiplicationKernel>();
     k->configure(input1, input2, output);
     _kernel = std::move(k);
-
-    if(output->info()->dimension(0) > 1)
-    {
-        ITensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
-
-        if(broadcasted_info->info()->dimension(0) == 1)
-        {
-            _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
-        }
-    }
 }
 
 Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
@@ -79,5 +61,85 @@
     ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
     return NEComplexPixelWiseMultiplicationKernel::validate(input1, input2, output);
 }
+} // namespace experimental
 
+struct NEPixelWiseMultiplication::Impl
+{
+    const ITensor                                           *src_0{ nullptr };
+    const ITensor                                           *src_1{ nullptr };
+    ITensor                                                 *dst{ nullptr };
+    std::unique_ptr<experimental::NEPixelWiseMultiplication> op{ nullptr };
+};
+
+NEPixelWiseMultiplication::NEPixelWiseMultiplication()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+NEPixelWiseMultiplication::NEPixelWiseMultiplication(NEPixelWiseMultiplication &&) = default;
+NEPixelWiseMultiplication &NEPixelWiseMultiplication::operator=(NEPixelWiseMultiplication &&) = default;
+NEPixelWiseMultiplication::~NEPixelWiseMultiplication()                                       = default;
+
+Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+                                           const ActivationLayerInfo &act_info)
+{
+    return experimental::NEPixelWiseMultiplication::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
+}
+
+void NEPixelWiseMultiplication::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+                                          const ActivationLayerInfo &act_info)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::NEPixelWiseMultiplication>();
+    _impl->op->configure(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy, act_info);
+}
+
+void NEPixelWiseMultiplication::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+struct NEComplexPixelWiseMultiplication::Impl
+{
+    ITensor                                                        *src_0{ nullptr };
+    ITensor                                                        *src_1{ nullptr };
+    ITensor                                                        *dst{ nullptr };
+    std::unique_ptr<experimental::NEComplexPixelWiseMultiplication> op{ nullptr };
+};
+
+NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication(NEComplexPixelWiseMultiplication &&) = default;
+NEComplexPixelWiseMultiplication &NEComplexPixelWiseMultiplication::operator=(NEComplexPixelWiseMultiplication &&) = default;
+NEComplexPixelWiseMultiplication::~NEComplexPixelWiseMultiplication()                                              = default;
+
+Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+    return experimental::NEComplexPixelWiseMultiplication::validate(input1, input2, output, act_info);
+}
+
+void NEComplexPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::NEComplexPixelWiseMultiplication>();
+    _impl->op->configure(input1->info(), input2->info(), output->info(), act_info);
+}
+
+void NEComplexPixelWiseMultiplication::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
+    pack.add_tensor(TensorType::ACL_SRC_1, _impl->src_1);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index 12921cf..81bd00d 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,8 +49,8 @@
         case DataLayout::NCHW:
         {
             // Configure border depending on operation required (quantize border in case of asymmetric data_type)
-            BorderMode border_mode = (pool_info.pool_type == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
-            PixelValue zero_value(0.f);
+            BorderMode border_mode = (!indices && pool_info.pool_type == PoolingType::MAX) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
+            PixelValue zero_value((indices) ? std::numeric_limits<int>::min() : 0.f);
             if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding)
             {
                 zero_value = PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
diff --git a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
index 6e7d4ab..fda130b 100644
--- a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
+++ b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEQLSTMLayer.cpp b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
index beb180f..cb45b64 100644
--- a/src/runtime/NEON/functions/NEQLSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -189,6 +189,10 @@
     if(_has_projection)
     {
         _projection_reduction.configure(_projection_weights, &_projection_eff_bias, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
+        if(_projection_bias != nullptr)
+        {
+            _projection_bias_add.configure(_projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE);
+        }
     }
 
     // Pre-transpose weights to be used in GEMM.
@@ -353,7 +357,7 @@
         input_activation_input->allocator()->allocate();
     }
     // Cell.
-    // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel
+    // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
     _pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
     const float      cell_gate_scale      = _cell_gate.info()->quantization_info().uniform().scale;
     const float      mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift);
@@ -388,7 +392,7 @@
 
     if(_has_peephole)
     {
-        // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel
+        // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
         // Here we are not using the output stage because all operations are done in float
         _mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32));
         _memory_group.manage(&_mul_cell_to_output_res);
@@ -422,7 +426,7 @@
 
     // Hidden.
     _hidden_tanh.configure(cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
-    // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel
+    // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
     _memory_group.manage(&_hidden_mul_res);
     const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32);
     _hidden_mul_res.allocator()->init(hidden_mul_res);
@@ -612,6 +616,11 @@
         ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false,
                                                                                lstm_params.hidden_state_zero(),
                                                                                true)));
+        if(lstm_params.projection_bias() != nullptr)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.projection_bias(), 1, DataType::S32);
+            ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info, &projection_eff_bias_info, ConvertPolicy::SATURATE));
+        }
     }
 
     const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_forget_weights->data_type(), input_to_forget_weights->quantization_info());
@@ -644,6 +653,7 @@
     const bool has_layer_norm = lstm_params.use_layer_norm();
 
     // Forget gate.
+    ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_intermediate_scale() == 0);
     const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
     const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);
     const float      input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();
@@ -652,17 +662,17 @@
     const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
     ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info));
 
-    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
 
     if(lstm_params.has_peephole_opt())
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
-                                                                              RoundingPolicy::TO_ZERO));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
+                                                                        RoundingPolicy::TO_ZERO));
         const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
         ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
         ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
     }
 
     if(has_layer_norm)
@@ -679,6 +689,7 @@
     ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_outstage_info, &forget_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
 
     // Modulation gate.
+    ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_intermediate_scale() == 0);
     const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
     const float      input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
     ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info));
@@ -686,7 +697,7 @@
     const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
     ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info));
 
-    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE));
 
     if(has_layer_norm)
     {
@@ -703,7 +714,7 @@
     if(lstm_params.has_cifg_opt())
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used");
-        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtractionKernel::validate(&input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE));
     }
     else
     {
@@ -714,6 +725,7 @@
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.input_gate_bias());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias());
 
+        ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_intermediate_scale() == 0);
         const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
         const float      input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
         ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info));
@@ -721,16 +733,16 @@
         const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
         ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info));
 
-        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
 
         if(lstm_params.has_peephole_opt())
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
-                                                                                  RoundingPolicy::TO_ZERO));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
+                                                                            RoundingPolicy::TO_ZERO));
             const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
             ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
             ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
-            ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
         }
 
         if(has_layer_norm)
@@ -743,15 +755,16 @@
         ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_outstage_info, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
     }
     // Cell.
-    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
     if(quantized_cell_clip > 0)
     {
         ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip,
                                                                                                              quantized_cell_clip)));
     }
     // Output gate.
+    ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_intermediate_scale() == 0);
     const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
     const float      input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
     ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info));
@@ -759,17 +772,17 @@
     const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
     ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info));
 
-    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
     if(lstm_params.has_peephole_opt())
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16);
-        // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel
+        // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
         // Here we are not using the output stage because all operations are done in float
         // const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
         // ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
-                                                                              RoundingPolicy::TO_ZERO));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
+                                                                        RoundingPolicy::TO_ZERO));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
     }
 
     if(has_layer_norm)
@@ -786,7 +799,9 @@
     ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
     const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32);
     const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED);
-    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplicationKernel::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+
+    ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.hidden_state_scale() == 0);
     const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
     ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true));
     gemmlowp_info.gemmlowp_offset = lstm_params.hidden_state_zero();
@@ -798,7 +813,7 @@
     if(lstm_params.has_projection())
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights());
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.projection_bias());
+        ARM_COMPUTE_RETURN_ERROR_ON(qoutput_state_in.scale == 0);
 
         const UniformQuantizationInfo qprojection      = lstm_params.projection_weights()->quantization_info().uniform();
         const float                   projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
@@ -822,7 +837,7 @@
             ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(*output_state_out, projection_outstage_info));
         }
 
-        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE));
 
         if(projection_tensor_copy_required)
         {
@@ -878,13 +893,13 @@
 
     _mm_recurrent_to_forget.run();
     _recurrent_to_forget_outstage.run();
-    NEScheduler::get().schedule(&_accumulate_input_recurrent_forget, Window::DimY);
+    _accumulate_input_recurrent_forget.run();
 
     if(_has_peephole)
     {
-        NEScheduler::get().schedule(&_pixelwise_mul_cell_to_forget, Window::DimY);
+        _pixelwise_mul_cell_to_forget.run();
         _cell_to_forget_outstage.run();
-        NEScheduler::get().schedule(&_accumulate_cell_forget, Window::DimY);
+        _accumulate_cell_forget.run();
     }
 
     if(_has_layer_norm)
@@ -900,7 +915,7 @@
 
     _mm_recurrent_to_cell.run();
     _recurrent_to_cell_outstage.run();
-    NEScheduler::get().schedule(&_accumulate_input_recurrent_modulation, Window::DimY);
+    _accumulate_input_recurrent_modulation.run();
 
     if(_has_layer_norm)
     {
@@ -912,7 +927,7 @@
     // Input gate
     if(_has_cifg)
     {
-        NEScheduler::get().schedule(&_input_gate_sub, Window::DimY);
+        _input_gate_sub.run();
     }
     else
     {
@@ -920,13 +935,13 @@
         _input_to_input_outstage.run();
         _mm_recurrent_to_input.run();
         _recurrent_to_input_outstage.run();
-        NEScheduler::get().schedule(&_accumulate_input_recurrent_input, Window::DimY);
+        _accumulate_input_recurrent_input.run();
 
         if(_has_peephole)
         {
-            NEScheduler::get().schedule(&_pixelwise_mul_cell_to_input, Window::DimY);
+            _pixelwise_mul_cell_to_input.run();
             _cell_to_input_outstage.run();
-            NEScheduler::get().schedule(&_accumulate_cell_input, Window::DimY);
+            _accumulate_cell_input.run();
         }
 
         if(_has_layer_norm)
@@ -938,9 +953,10 @@
     }
 
     // Cell.
-    NEScheduler::get().schedule(&_pixelwise_mul_forget_cell, Window::DimY);
-    NEScheduler::get().schedule(&_pixelwise_mul_input_cell, Window::DimY);
-    NEScheduler::get().schedule(&_add_forget_cell, Window::DimY);
+    _pixelwise_mul_forget_cell.run();
+    _pixelwise_mul_input_cell.run();
+    _add_forget_cell.run();
+
     if(_has_cell_clipping)
     {
         _cell_clip.run();
@@ -951,12 +967,12 @@
     _input_to_output_outstage.run();
     _mm_recurrent_to_output.run();
     _recurrent_to_output_outstage.run();
-    NEScheduler::get().schedule(&_accumulate_input_recurrent_output, Window::DimY);
+    _accumulate_input_recurrent_output.run();
     if(_has_peephole)
     {
-        NEScheduler::get().schedule(&_pixelwise_mul_cell_to_output, Window::DimY);
+        _pixelwise_mul_cell_to_output.run();
         _cell_to_output_outstage.run();
-        NEScheduler::get().schedule(&_accumulate_cell_to_output, Window::DimY);
+        _accumulate_cell_to_output.run();
     }
 
     if(_has_layer_norm)
@@ -968,7 +984,7 @@
 
     // Hidden.
     _hidden_tanh.run();
-    NEScheduler::get().schedule(&_pixelwise_mul_hidden, Window::DimY);
+    _pixelwise_mul_hidden.run();
     _hidden_outstage.run();
 
     // Projection.
@@ -982,7 +998,7 @@
             _projection_output_to_accumulate_copy.run();
         }
 
-        NEScheduler::get().schedule(&_accumulate_projection, Window::DimY);
+        _accumulate_projection.run();
 
         if(_projection_tensor_copy_required)
         {
@@ -1058,10 +1074,11 @@
 
         if(_has_projection)
         {
+            _projection_eff_bias.allocator()->allocate();
+            NEScheduler::get().schedule(&_projection_reduction, Window::DimY);
             if(_projection_bias != nullptr)
             {
-                _projection_eff_bias.allocator()->allocate();
-                NEScheduler::get().schedule(&_projection_reduction, Window::DimY);
+                _projection_bias_add.run();
                 _projection_bias->mark_as_unused();
             }
 
diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
index 47cc8b0..c042705 100644
--- a/src/runtime/NEON/functions/NEQuantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NERNNLayer.cpp b/src/runtime/NEON/functions/NERNNLayer.cpp
index 154b060..b7415bd 100644
--- a/src/runtime/NEON/functions/NERNNLayer.cpp
+++ b/src/runtime/NEON/functions/NERNNLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,8 +34,8 @@
 namespace arm_compute
 {
 NERNNLayer::NERNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected(memory_manager), _copy_kernel(), _fully_connected_out(), _gemm_output(),
-      _add_output(), _is_prepared(false)
+    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_f(), _activation(), _fully_connected(memory_manager), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output(),
+      _is_prepared(false)
 {
 }
 
@@ -43,6 +43,7 @@
                             const ITensorInfo *output, const ActivationLayerInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
 
     const int idx_width  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
     const int idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
@@ -59,8 +60,8 @@
     auto shape_info = TensorInfo(misc::shape_calculator::compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type());
 
     ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, weights, bias, &shape_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&shape_info, &shape_info, info));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&shape_info, &shape_info, info));
 
     return Status{};
 }
@@ -90,12 +91,12 @@
     _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
     _memory_group.manage(&_add_output);
 
-    _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE);
+    _add_f.configure(&_fully_connected_out, &_gemm_output, &_add_output, ConvertPolicy::SATURATE);
 
     _fully_connected_out.allocator()->allocate();
     _gemm_output.allocator()->allocate();
 
-    _activation_kernel.configure(&_add_output, hidden_state, info);
+    _activation.configure(&_add_output, hidden_state, info);
     _add_output.allocator()->allocate();
 
     _copy_kernel.configure(hidden_state, output);
@@ -111,8 +112,8 @@
 
     _gemm_state_f.run();
 
-    NEScheduler::get().schedule(&_add_kernel, Window::DimY);
-    NEScheduler::get().schedule(&_activation_kernel, Window::DimY);
+    _add_f.run();
+    _activation.run();
 
     // copy hidden out to output
     NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
diff --git a/src/runtime/NEON/functions/NEROIAlignLayer.cpp b/src/runtime/NEON/functions/NEROIAlignLayer.cpp
index 2299bf7..a3b116a 100644
--- a/src/runtime/NEON/functions/NEROIAlignLayer.cpp
+++ b/src/runtime/NEON/functions/NEROIAlignLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
index 3aca4b7..4aecadb 100644
--- a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NERange.cpp b/src/runtime/NEON/functions/NERange.cpp
index 977d502..138b458 100644
--- a/src/runtime/NEON/functions/NERange.cpp
+++ b/src/runtime/NEON/functions/NERange.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp
index d53ed31..079c7c6 100644
--- a/src/runtime/NEON/functions/NEReduceMean.cpp
+++ b/src/runtime/NEON/functions/NEReduceMean.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index 80ebe67..853d0ed 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -54,7 +54,7 @@
 } // namespace
 
 NEReductionOperation::NEReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _reduction_kernel(), _fill_border_kernel(), _reshape_kernel(), _output_internal(), _window_split(0), _reduction_axis(), _is_reshape_required(false)
+    : _memory_group(memory_manager), _reduction_kernel(), _fill_border_kernel(), _reshape(), _output_internal(), _window_split(0), _reduction_axis(), _is_reshape_required(false)
 {
 }
 
@@ -91,7 +91,7 @@
 
     if(is_reshape_required)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(output_internal, output));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(output_internal, output));
     }
 
     return Status{};
@@ -171,7 +171,7 @@
 
     if(_is_reshape_required)
     {
-        _reshape_kernel.configure(output_internal, output);
+        _reshape.configure(output_internal, output);
         _output_internal.allocator()->allocate();
     }
 }
@@ -185,7 +185,7 @@
     NEScheduler::get().schedule(&_reduction_kernel, _window_split);
     if(_is_reshape_required)
     {
-        NEScheduler::get().schedule(&_reshape_kernel, Window::DimY);
+        _reshape.run();
     }
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NERemap.cpp b/src/runtime/NEON/functions/NERemap.cpp
index 12c9f7b..d4e7f83 100644
--- a/src/runtime/NEON/functions/NERemap.cpp
+++ b/src/runtime/NEON/functions/NERemap.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEReorgLayer.cpp b/src/runtime/NEON/functions/NEReorgLayer.cpp
index dc8f5f1..dfe002a 100644
--- a/src/runtime/NEON/functions/NEReorgLayer.cpp
+++ b/src/runtime/NEON/functions/NEReorgLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEReshapeLayer.cpp b/src/runtime/NEON/functions/NEReshapeLayer.cpp
index 0a9f42d..c1c88c1 100644
--- a/src/runtime/NEON/functions/NEReshapeLayer.cpp
+++ b/src/runtime/NEON/functions/NEReshapeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,24 +25,68 @@
 
 #include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/Types.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
 
 namespace arm_compute
 {
-void NEReshapeLayer::configure(const ITensor *input, ITensor *output)
+namespace experimental
+{
+void NEReshape::configure(const ITensorInfo *input, ITensorInfo *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEReshapeLayerKernel>();
     k->configure(input, output);
     _kernel = std::move(k);
 }
 
+Status NEReshape::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    return arm_compute::NEReshapeLayerKernel::validate(input, output);
+}
+} // namespace experimental
+
+struct NEReshapeLayer::Impl
+{
+    const ITensor                           *src{ nullptr };
+    ITensor                                 *dst{ nullptr };
+    std::unique_ptr<experimental::NEReshape> op{ nullptr };
+};
+
+NEReshapeLayer::NEReshapeLayer()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+
+NEReshapeLayer::NEReshapeLayer(NEReshapeLayer &&) = default;
+
+NEReshapeLayer &NEReshapeLayer::operator=(NEReshapeLayer &&) = default;
+
+NEReshapeLayer::~NEReshapeLayer() = default;
+
+void NEReshapeLayer::configure(const ITensor *input, ITensor *output)
+{
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = arm_compute::support::cpp14::make_unique<experimental::NEReshape>();
+    _impl->op->configure(input->info(), output->info());
+}
+
 Status NEReshapeLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(experimental::NEReshape::validate(input, output));
 
     return Status{};
 }
+
+void NEReshapeLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEReverse.cpp b/src/runtime/NEON/functions/NEReverse.cpp
index a950826..c60c84e 100644
--- a/src/runtime/NEON/functions/NEReverse.cpp
+++ b/src/runtime/NEON/functions/NEReverse.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index f1e9a87..424049f 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,18 +30,21 @@
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/Rounding.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/TensorAllocator.h"
 
+#include "src/core/utils/ScaleUtils.h"
+
 #include <cmath>
 #include <cstddef>
 #include <utility>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
-void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, size_t input_element_size, SamplingPolicy sampling_policy)
+void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, size_t input_element_size, SamplingPolicy sampling_policy, bool align_corners)
 {
     ARM_COMPUTE_ERROR_ON(nullptr == offsets);
     ARM_COMPUTE_UNUSED(sampling_policy);
@@ -82,7 +85,8 @@
 
         execute_window_loop(win, [&](const Coordinates & id)
         {
-            const size_t in_xi = std::floor((id.x() + sampling_offset) * wr);
+            const float float_in_xi = (id.x() + sampling_offset) * wr;
+            const auto  in_xi       = static_cast<size_t>(align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi) : std::floor(float_in_xi));
 
             *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * input_element_size;
         },
@@ -97,21 +101,17 @@
       _dy(),
       _scale_kernel(),
       _border_handler(),
-      _use_padding(true),
-      _align_corners(false)
+      _use_padding(true)
 {
 }
 
-void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding,
-                        bool align_corners)
+void NEScale::configure(ITensor *input, ITensor *output, const ScaleKernelInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(NEScale::validate(input->info(), output->info(), policy, border_mode, constant_border_value, sampling_policy, use_padding));
+    ARM_COMPUTE_ERROR_THROW_ON(NEScale::validate(input->info(), output->info(), info));
 
-    _use_padding   = use_padding;
-    _align_corners = policy == InterpolationPolicy::BILINEAR
-                     && sampling_policy == SamplingPolicy::TOP_LEFT
-                     && align_corners;
+    _use_padding                     = info.use_padding;
+    const bool is_align_corners_used = info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy);
 
     // Get data layout and width/height indices
     const DataLayout data_layout = input->info()->data_layout();
@@ -122,32 +122,29 @@
     const TensorShape shape(output->info()->dimension(idx_width), output->info()->dimension(idx_height));
 
     // Compute the ratio between source width/height and destination width/height
-    const auto wr = arm_compute::calculate_resize_ratio(input->info()->dimension(idx_width), output->info()->dimension(idx_width), _align_corners);
-    const auto hr = arm_compute::calculate_resize_ratio(input->info()->dimension(idx_height), output->info()->dimension(idx_height), _align_corners);
+    const auto wr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_width), output->info()->dimension(idx_width), is_align_corners_used);
+    const auto hr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_height), output->info()->dimension(idx_height), is_align_corners_used);
 
     // Get the element size of the input image
     const size_t input_element_size = input->info()->element_size();
 
     // Area interpolation behaves as Nearest Neighbour in case of up-sampling
-    if(policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
-    {
-        policy = InterpolationPolicy::NEAREST_NEIGHBOR;
-    }
+    const auto policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : info.interpolation_policy;
 
-    switch(policy)
+    switch(policy_to_use)
     {
         case InterpolationPolicy::NEAREST_NEIGHBOR:
         {
             TensorInfo tensor_info_offsets(shape, Format::S32);
             _offsets.allocator()->init(tensor_info_offsets);
 
-            _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, policy, border_mode, constant_border_value, sampling_policy, use_padding);
+            _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, info);
 
             // Allocate once the configure methods have been called
             _offsets.allocator()->allocate();
 
             // Pre-compute offsets for nearest interpolation
-            precompute_dx_dy_offsets(nullptr, nullptr, &_offsets, wr, hr, input_element_size, sampling_policy);
+            precompute_dx_dy_offsets(nullptr, nullptr, &_offsets, wr, hr, input_element_size, info.sampling_policy, is_align_corners_used);
             break;
         }
         case InterpolationPolicy::BILINEAR:
@@ -159,7 +156,7 @@
             _dx.allocator()->init(tensor_info_dxdy);
             _dy.allocator()->init(tensor_info_dxdy);
 
-            _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, policy, border_mode, constant_border_value, sampling_policy, use_padding, align_corners);
+            _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, info);
 
             // Allocate once the configure methods have been called
             _offsets.allocator()->allocate();
@@ -167,29 +164,33 @@
             _dy.allocator()->allocate();
 
             // Pre-compute dx, dy and offsets for bilinear interpolation
-            precompute_dx_dy_offsets(&_dx, &_dy, &_offsets, wr, hr, input_element_size, sampling_policy);
+            precompute_dx_dy_offsets(&_dx, &_dy, &_offsets, wr, hr, input_element_size, info.sampling_policy, is_align_corners_used);
             break;
         }
         case InterpolationPolicy::AREA:
         {
-            _scale_kernel.configure(input, nullptr, nullptr, nullptr, output, policy, border_mode, constant_border_value);
+            _scale_kernel.configure(input, nullptr, nullptr, nullptr, output, info);
             break;
         }
         default:
             ARM_COMPUTE_ERROR("Unsupported interpolation mode");
     }
-    if(use_padding)
+    if(info.use_padding)
     {
-        _border_handler.configure(input, _scale_kernel.border_size(), border_mode, constant_border_value);
+        _border_handler.configure(input, _scale_kernel.border_size(), info.border_mode, info.constant_border_value);
     }
 }
 
-Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy,
-                         BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding, bool align_corners)
+void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding,
+                        bool align_corners)
+{
+    configure(input, output, ScaleKernelInfo{ policy, border_mode, constant_border_value, sampling_policy, use_padding, align_corners });
+}
+
+Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, const ScaleKernelInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER && sampling_policy != SamplingPolicy::TOP_LEFT);
-    ARM_COMPUTE_UNUSED(border_mode, constant_border_value);
+    ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT);
 
     ITensorInfo *offsets = nullptr;
     ITensorInfo *dx      = nullptr;
@@ -207,7 +208,7 @@
     TensorInfo tensor_info_dx(shape, Format::F32);
     TensorInfo tensor_info_dy(shape, Format::F32);
 
-    switch(policy)
+    switch(info.interpolation_policy)
     {
         case InterpolationPolicy::NEAREST_NEIGHBOR:
             offsets = &tensor_info_offsets;
@@ -221,8 +222,14 @@
             break;
     }
 
-    ARM_COMPUTE_RETURN_ON_ERROR(NEScaleKernel::validate(input->clone().get(), dx, dy, offsets, output->clone().get(),
-                                                        policy, border_mode, constant_border_value, sampling_policy, use_padding, align_corners));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEScaleKernel::validate(input->clone().get(), dx, dy, offsets, output->clone().get(), info));
+    return Status{};
+}
+
+Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy,
+                         BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding, bool align_corners)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(NEScale::validate(input, output, ScaleKernelInfo{ policy, border_mode, constant_border_value, sampling_policy, use_padding, align_corners }));
     return Status{};
 }
 
@@ -234,3 +241,4 @@
     }
     NEScheduler::get().schedule(&_scale_kernel, Window::DimY);
 }
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEScharr3x3.cpp b/src/runtime/NEON/functions/NEScharr3x3.cpp
index b7a99ff..bf787e1 100644
--- a/src/runtime/NEON/functions/NEScharr3x3.cpp
+++ b/src/runtime/NEON/functions/NEScharr3x3.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NESelect.cpp b/src/runtime/NEON/functions/NESelect.cpp
index 8587f7f..8def123 100644
--- a/src/runtime/NEON/functions/NESelect.cpp
+++ b/src/runtime/NEON/functions/NESelect.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp b/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp
index a4b0dff..b0cafae 100644
--- a/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp
+++ b/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NESlice.cpp b/src/runtime/NEON/functions/NESlice.cpp
index 5da8896..2bacf2e 100644
--- a/src/runtime/NEON/functions/NESlice.cpp
+++ b/src/runtime/NEON/functions/NESlice.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,7 +33,9 @@
 
 namespace arm_compute
 {
-void NESlice::configure(const ITensor *input, ITensor *output, const Coordinates &starts, const Coordinates &ends)
+namespace experimental
+{
+void NESlice::configure(const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
 
@@ -60,4 +62,42 @@
 
     return NEStridedSliceKernel::validate(input, output, starts, ends, BiStrides(), 0, slice_end_mask, 0);
 }
+} // namespace experimental
+
+struct NESlice::Impl
+{
+    const ITensor                         *src{ nullptr };
+    ITensor                               *dst{ nullptr };
+    std::unique_ptr<experimental::NESlice> op{ nullptr };
+};
+
+NESlice::NESlice()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+NESlice::NESlice(NESlice &&) = default;
+NESlice &NESlice::operator=(NESlice &&) = default;
+NESlice::~NESlice()                     = default;
+
+Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+{
+    return experimental::NESlice::validate(input, output, starts, ends);
+}
+
+void NESlice::configure(const ITensor *input, ITensor *output, const Coordinates &starts, const Coordinates &ends)
+{
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = arm_compute::support::cpp14::make_unique<experimental::NESlice>();
+    _impl->op->configure(input->info(), output->info(), starts, ends);
+}
+
+void NESlice::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NESobel3x3.cpp b/src/runtime/NEON/functions/NESobel3x3.cpp
index ca80ccd..cfd68d7 100644
--- a/src/runtime/NEON/functions/NESobel3x3.cpp
+++ b/src/runtime/NEON/functions/NESobel3x3.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NESobel5x5.cpp b/src/runtime/NEON/functions/NESobel5x5.cpp
index 2ddfee5..092c510 100644
--- a/src/runtime/NEON/functions/NESobel5x5.cpp
+++ b/src/runtime/NEON/functions/NESobel5x5.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NESobel7x7.cpp b/src/runtime/NEON/functions/NESobel7x7.cpp
index b47a37a..87ec81f 100644
--- a/src/runtime/NEON/functions/NESobel7x7.cpp
+++ b/src/runtime/NEON/functions/NESobel7x7.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 57d75af..750992f 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,42 +27,39 @@
 #include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "utils/TypePrinter.h"
-
-#include <cfloat>
 
 namespace arm_compute
 {
 template <bool IS_LOG>
 NESoftmaxLayerGeneric<IS_LOG>::NESoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _max_kernel(), _softmax_kernel(), _flat_or_reshape_kernel_ptr(nullptr), _fill_border_kernel(), _reshape_kernel(), _max(), _tmp(), _input_flattened(),
-      _output_flattened(), _needs_flattening(false)
+    : _memory_group(std::move(memory_manager)), _max_kernel(), _softmax_kernel(), _flat_or_reshape_ptr(nullptr), _fill_border_kernel(), _reshape(), _max(), _tmp(), _input_flattened(), _output_flattened(),
+      _needs_flattening(false)
 {
 }
 
 template <bool IS_LOG>
-void NESoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const ITensor *input, const ITensor *output, int32_t axis)
+void NESoftmaxLayerGeneric<IS_LOG>::configure_reshape_input_kernel(const ITensor *input, const ITensor *output, int32_t first_n_reduce_axes)
 {
     // Flatten the input
-    const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input->info(), axis);
+    const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input->info(), first_n_reduce_axes);
 
     // Initialize the flat input
     _input_flattened.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten));
 
-    // If we need to flatten the input, we can use NEFlattenKernel or NEReshapeKernel
-    // If flattening on the third axes, we use NEFlattenKernel.
-    // In all other cases we have to use NEReshapeKernel
-    if(axis != 3)
+    // Note that the "other cases" include both:
+    //   1. first_n_reduce_axes < 3: Reduce the first 1 (no need to reduce) or 2 dimensions (inclusive)
+    //   2. first_n_reduce_axes == 4: Reduce all 4 dimensions. This can only be handled by NEReshapeKernel instead of NEFlattenKernel.
+    if(first_n_reduce_axes == 3)
     {
-        auto reshape_kernel_ptr = support::cpp14::make_unique<NEReshapeLayerKernel>();
-        reshape_kernel_ptr->configure(input, &_input_flattened);
-        _flat_or_reshape_kernel_ptr = std::move(reshape_kernel_ptr);
+        auto flatten_kernel_ptr = support::cpp14::make_unique<NEFlattenLayer>();
+        flatten_kernel_ptr->configure(input, &_input_flattened);
+        _flat_or_reshape_ptr = std::move(flatten_kernel_ptr);
     }
     else
     {
-        auto flatten_kernel_ptr = support::cpp14::make_unique<NEFlattenLayerKernel>();
-        flatten_kernel_ptr->configure(input, &_input_flattened);
-        _flat_or_reshape_kernel_ptr = std::move(flatten_kernel_ptr);
+        auto reshape_kernel_ptr = support::cpp14::make_unique<NEReshapeLayer>();
+        reshape_kernel_ptr->configure(input, &_input_flattened);
+        _flat_or_reshape_ptr = std::move(reshape_kernel_ptr);
     }
 
     // We need to init the output tensor here. Indeed, the reshape kernel expects
@@ -77,11 +74,11 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(NESoftmaxLayerGeneric::validate(input->info(), output->info(), beta, axis));
 
-    // Handle negative axis, negative index is used to specify axis from the end (e.g. -1 for the last axis).
-    axis = wrap_around(axis, static_cast<int32_t>(input->info()->num_dimensions()));
+    // Convert reduce-before axis (inclusive) to first n axes to reduce
+    size_t first_n_reduce_axes = dim_index_2_num_dims(axis, static_cast<int32_t>(input->info()->num_dimensions()));
 
-    // We don't need flattening only in the case the input is 2D and axis is 1
-    _needs_flattening = axis != 1;
+    // We only need flattening when the number of axes to reduce is greater than 1
+    _needs_flattening = first_n_reduce_axes > 1;
 
     // If we are dealing with a 4D tensor, we will:
     // - Flatten the input, so that we end up with a [width*height*depth] * batches 2D tensor
@@ -93,7 +90,7 @@
         _memory_group.manage(&_input_flattened);
 
         // Configure  _flatten_kernel and _input_flattened
-        configure_reshape_input_kernel(input, output, axis);
+        configure_reshape_input_kernel(input, output, first_n_reduce_axes);
     }
 
     // We want to deal with a 2D input. Either it is the flattened version of the original input (4D case)
@@ -127,7 +124,7 @@
         _input_flattened.allocator()->allocate();
 
         // Reshape the flat output into the requested (4D) output
-        _reshape_kernel.configure(&_output_flattened, output);
+        _reshape.configure(&_output_flattened, output);
 
         // Allocate the intermediate flat tensors
         _output_flattened.allocator()->allocate();
@@ -151,10 +148,11 @@
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Only up to 4 dimensions are supported");
     ARM_COMPUTE_UNUSED(beta);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != 0, "Only axis 0 supported");
     ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast<int32_t>(-input->num_dimensions()) || static_cast<int32_t>(input->num_dimensions()) <= axis);
 
-    // Handle negative axis, negative index is used to specify axis from the end (e.g. -1 for the last axis).
-    axis = wrap_around(axis, static_cast<int32_t>(input->num_dimensions()));
+    // Convert reduce-before axis (inclusive) to first n axes to reduce
+    size_t first_n_reduce_axes = dim_index_2_num_dims(axis, static_cast<int32_t>(input->num_dimensions()));
 
     // Create intermediate tensor info
     DataType         tmp_data_type = input->data_type();
@@ -165,20 +163,20 @@
     const TensorInfo tensor_info_max_sum(input->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(input->quantization_info()).set_is_resizable(true));
     const TensorInfo dont_care;
 
-    const bool needs_flattening = (axis != 1);
+    const bool needs_flattening = (first_n_reduce_axes > 1);
 
     if(needs_flattening)
     {
-        const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input, axis);
+        const TensorShape shape_flatten = misc::shape_calculator::compute_softmax_shape(input, first_n_reduce_axes);
         TensorInfo        tensor_info_flat(input->clone()->set_tensor_shape(shape_flatten).set_is_resizable(true));
 
-        if(axis != 3)
+        if(first_n_reduce_axes == 3)
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(input, &tensor_info_flat));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayer::validate(input, &tensor_info_flat));
         }
         else
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &tensor_info_flat));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(input, &tensor_info_flat));
         }
     }
 
@@ -195,7 +193,7 @@
 
     if(_needs_flattening)
     {
-        NEScheduler::get().schedule(_flat_or_reshape_kernel_ptr.get(), Window::DimY);
+        _flat_or_reshape_ptr->run();
     }
 
     NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
@@ -204,11 +202,11 @@
 
     if(_needs_flattening)
     {
-        NEScheduler::get().schedule(&_reshape_kernel, Window::DimY);
+        _reshape.run();
     }
 }
 
 template class NESoftmaxLayerGeneric<false>;
 template class NESoftmaxLayerGeneric<true>;
 
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
index 205bc91..97e793f 100644
--- a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
+++ b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp b/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp
index 18d8291..3e1ec80 100644
--- a/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp
+++ b/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NESplit.cpp b/src/runtime/NEON/functions/NESplit.cpp
index 8131e47..db19bbb 100644
--- a/src/runtime/NEON/functions/NESplit.cpp
+++ b/src/runtime/NEON/functions/NESplit.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEStackLayer.cpp b/src/runtime/NEON/functions/NEStackLayer.cpp
index 351497c..a99a95a 100644
--- a/src/runtime/NEON/functions/NEStackLayer.cpp
+++ b/src/runtime/NEON/functions/NEStackLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEStridedSlice.cpp b/src/runtime/NEON/functions/NEStridedSlice.cpp
index c9be563..8bf81e8 100644
--- a/src/runtime/NEON/functions/NEStridedSlice.cpp
+++ b/src/runtime/NEON/functions/NEStridedSlice.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,13 +23,16 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEStridedSlice.h"
 
+#include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/NEON/kernels/NEStridedSliceKernel.h"
 #include "arm_compute/core/Types.h"
 #include "support/MemorySupport.h"
 
 namespace arm_compute
 {
-void NEStridedSlice::configure(const ITensor *input, ITensor *output,
+namespace experimental
+{
+void NEStridedSlice::configure(const ITensorInfo *input, ITensorInfo *output,
                                const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
                                int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
 {
@@ -44,4 +47,45 @@
 {
     return NEStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
 }
+} // namespace experimental
+
+struct NEStridedSlice::Impl
+{
+    const ITensor                                *src{ nullptr };
+    ITensor                                      *dst{ nullptr };
+    std::unique_ptr<experimental::NEStridedSlice> op{ nullptr };
+};
+
+NEStridedSlice::NEStridedSlice()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+NEStridedSlice::NEStridedSlice(NEStridedSlice &&) = default;
+NEStridedSlice &NEStridedSlice::operator=(NEStridedSlice &&) = default;
+NEStridedSlice::~NEStridedSlice()                            = default;
+
+void NEStridedSlice::configure(const ITensor *input, ITensor *output,
+                               const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                               int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = arm_compute::support::cpp14::make_unique<experimental::NEStridedSlice>();
+    _impl->op->configure(input->info(), output->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+}
+
+void NEStridedSlice::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
+}
+
+Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output,
+                                const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
+                                int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+{
+    return experimental::NEStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+}
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NETableLookup.cpp b/src/runtime/NEON/functions/NETableLookup.cpp
index 44cbbc8..b8d765f 100644
--- a/src/runtime/NEON/functions/NETableLookup.cpp
+++ b/src/runtime/NEON/functions/NETableLookup.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEThreshold.cpp b/src/runtime/NEON/functions/NEThreshold.cpp
index f4fd857..e21511e 100644
--- a/src/runtime/NEON/functions/NEThreshold.cpp
+++ b/src/runtime/NEON/functions/NEThreshold.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,11 +28,22 @@
 
 #include <utility>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 void NEThreshold::configure(const ITensor *input, ITensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
 {
+    configure(input, output, ThresholdKernelInfo(threshold, false_value, true_value, type, upper));
+}
+
+void NEThreshold::configure(const ITensor *input, ITensor *output, const ThresholdKernelInfo &info)
+{
     auto k = arm_compute::support::cpp14::make_unique<NEThresholdKernel>();
-    k->configure(input, output, threshold, false_value, true_value, type, upper);
+    k->configure(input, output, info);
     _kernel = std::move(k);
 }
+
+Status NEThreshold::validate(const ITensorInfo *input, const ITensorInfo *output, const ThresholdKernelInfo &info)
+{
+    return NEThresholdKernel::validate(input, output, info);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NETile.cpp b/src/runtime/NEON/functions/NETile.cpp
index 6bf8eaa..6fda3a5 100644
--- a/src/runtime/NEON/functions/NETile.cpp
+++ b/src/runtime/NEON/functions/NETile.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NETranspose.cpp b/src/runtime/NEON/functions/NETranspose.cpp
index 21d4963..88d1672 100644
--- a/src/runtime/NEON/functions/NETranspose.cpp
+++ b/src/runtime/NEON/functions/NETranspose.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEUnstack.cpp b/src/runtime/NEON/functions/NEUnstack.cpp
index 21f35f8..50596db 100644
--- a/src/runtime/NEON/functions/NEUnstack.cpp
+++ b/src/runtime/NEON/functions/NEUnstack.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEUpsampleLayer.cpp b/src/runtime/NEON/functions/NEUpsampleLayer.cpp
index 9be96af..58c050f 100644
--- a/src/runtime/NEON/functions/NEUpsampleLayer.cpp
+++ b/src/runtime/NEON/functions/NEUpsampleLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEWarpAffine.cpp b/src/runtime/NEON/functions/NEWarpAffine.cpp
index 469ca65..ec2c688 100644
--- a/src/runtime/NEON/functions/NEWarpAffine.cpp
+++ b/src/runtime/NEON/functions/NEWarpAffine.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEWarpPerspective.cpp b/src/runtime/NEON/functions/NEWarpPerspective.cpp
index ac5edca..bf361b8 100644
--- a/src/runtime/NEON/functions/NEWarpPerspective.cpp
+++ b/src/runtime/NEON/functions/NEWarpPerspective.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index d567a18..1bad310 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,16 +25,16 @@
 
 #include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
+#include "src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"
 #include "support/MemorySupport.h"
 
 #include "arm_compute/core/NEON/kernels/convolution/common/utils.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd.hpp"
+#include "src/core/NEON/kernels/convolution/winograd/winograd.hpp"
 
 namespace arm_compute
 {
@@ -62,7 +62,7 @@
         }
     }
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    else if(input->data_type() == DataType::F32)
+    else if(input->data_type() == DataType::F16)
     {
         ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<__fp16, 4, 4, 3, 3>::validate(input, input0, winograd_info)));
         ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<__fp16, 4, 4, 3, 3>::validate(weights, input1, winograd_info)));
diff --git a/src/runtime/NEON/functions/NEYOLOLayer.cpp b/src/runtime/NEON/functions/NEYOLOLayer.cpp
index cef6246..233afb7 100644
--- a/src/runtime/NEON/functions/NEYOLOLayer.cpp
+++ b/src/runtime/NEON/functions/NEYOLOLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
index e0094f4..73a7caa 100644
--- a/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp
index f67f06f..11448e5 100644
--- a/src/runtime/OMP/OMPScheduler.cpp
+++ b/src/runtime/OMP/OMPScheduler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -83,6 +83,39 @@
     }
 }
 
+void OMPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, ITensorPack &tensors)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
+    ARM_COMPUTE_ERROR_ON_MSG(hints.strategy() == StrategyHint::DYNAMIC,
+                             "Dynamic scheduling is not supported in OMPScheduler");
+
+    const Window      &max_window     = kernel->window();
+    const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
+    const unsigned int num_threads    = std::min(num_iterations, _num_threads);
+
+    if(!kernel->is_parallelisable() || num_threads == 1)
+    {
+        ThreadInfo info;
+        info.cpu_info = &_cpu_info;
+        kernel->run_op(tensors, max_window, info);
+    }
+    else
+    {
+        const unsigned int                num_windows = num_threads;
+        std::vector<IScheduler::Workload> workloads(num_windows);
+        for(unsigned int t = 0; t < num_windows; t++)
+        {
+            //Capture 't' by copy, all the other variables by reference:
+            workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo & info)
+            {
+                Window win = max_window.split_window(hints.split_dimension(), t, num_windows);
+                win.validate();
+                kernel->run_op(tensors, win, info);
+            };
+        }
+        run_workloads(workloads);
+    }
+}
 #ifndef DOXYGEN_SKIP_THIS
 void OMPScheduler::run_workloads(std::vector<arm_compute::IScheduler::Workload> &workloads)
 {
diff --git a/src/runtime/OffsetLifetimeManager.cpp b/src/runtime/OffsetLifetimeManager.cpp
index 3133202..3bd8b02 100644
--- a/src/runtime/OffsetLifetimeManager.cpp
+++ b/src/runtime/OffsetLifetimeManager.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/OffsetMemoryPool.cpp b/src/runtime/OffsetMemoryPool.cpp
index c8381a1..677c55c 100644
--- a/src/runtime/OffsetMemoryPool.cpp
+++ b/src/runtime/OffsetMemoryPool.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/OperatorTensor.cpp b/src/runtime/OperatorTensor.cpp
new file mode 100644
index 0000000..5a35154
--- /dev/null
+++ b/src/runtime/OperatorTensor.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/OperatorTensor.h"
+#include "arm_compute/runtime/MemoryRegion.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+OperatorTensor::OperatorTensor(ITensorInfo *info, IMemory *memory)
+    : _info(info), _memory(memory), _mem_type(MemoryType::CPU)
+{
+}
+
+ITensorInfo *OperatorTensor::info() const
+{
+    return _info;
+}
+
+ITensorInfo *OperatorTensor::info()
+{
+    return _info;
+}
+
+uint8_t *OperatorTensor::buffer() const
+{
+    switch(_mem_type)
+    {
+        case MemoryType::CPU:
+            return (uint8_t *)dynamic_cast<MemoryRegion *>(_memory->region())->buffer();
+        default:
+            ARM_COMPUTE_ERROR("Memory type not supported.");
+    }
+}
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/runtime/PoolManager.cpp b/src/runtime/PoolManager.cpp
index 455f969..19ed257 100644
--- a/src/runtime/PoolManager.cpp
+++ b/src/runtime/PoolManager.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/Pyramid.cpp b/src/runtime/Pyramid.cpp
index 16a91a8..5664b5f 100644
--- a/src/runtime/Pyramid.cpp
+++ b/src/runtime/Pyramid.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/RuntimeContext.cpp b/src/runtime/RuntimeContext.cpp
index 308e278..504a74c 100644
--- a/src/runtime/RuntimeContext.cpp
+++ b/src/runtime/RuntimeContext.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp
index 380ad90..4063cc1 100644
--- a/src/runtime/Scheduler.cpp
+++ b/src/runtime/Scheduler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/SchedulerFactory.cpp b/src/runtime/SchedulerFactory.cpp
index c6c9034..e395c2e 100644
--- a/src/runtime/SchedulerFactory.cpp
+++ b/src/runtime/SchedulerFactory.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/SubTensor.cpp b/src/runtime/SubTensor.cpp
index b010a32..ae16c8b 100644
--- a/src/runtime/SubTensor.cpp
+++ b/src/runtime/SubTensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/Tensor.cpp b/src/runtime/Tensor.cpp
index 8f7ecd6..6dcef9f 100644
--- a/src/runtime/Tensor.cpp
+++ b/src/runtime/Tensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp
index ffd5cc7..e8c5c49 100644
--- a/src/runtime/TensorAllocator.cpp
+++ b/src/runtime/TensorAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/TracePoint.cpp b/src/runtime/TracePoint.cpp
index 817d63b..a4228b2 100644
--- a/src/runtime/TracePoint.cpp
+++ b/src/runtime/TracePoint.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/runtime/Utils.cpp b/src/runtime/Utils.cpp
index 2204ec1..534b421 100644
--- a/src/runtime/Utils.cpp
+++ b/src/runtime/Utils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/support/Bfloat16.h b/support/Bfloat16.h
index 65805f2..d57d8ce 100644
--- a/support/Bfloat16.h
+++ b/support/Bfloat16.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/support/Half.h b/support/Half.h
index ae31be3..5bea26c 100644
--- a/support/Half.h
+++ b/support/Half.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/support/MemorySupport.h b/support/MemorySupport.h
index 5ae5501..a904f34 100644
--- a/support/MemorySupport.h
+++ b/support/MemorySupport.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/support/Mutex.h b/support/Mutex.h
index c398c57..6e68fa5 100644
--- a/support/Mutex.h
+++ b/support/Mutex.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/support/Semaphore.h b/support/Semaphore.h
index 85749e1..e182b53 100644
--- a/support/Semaphore.h
+++ b/support/Semaphore.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/support/StringSupport.h b/support/StringSupport.h
index 2bd3987..5e237c7 100644
--- a/support/StringSupport.h
+++ b/support/StringSupport.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/support/ToolchainSupport.h b/support/ToolchainSupport.h
index 8235358..8bf7f98 100644
--- a/support/ToolchainSupport.h
+++ b/support/ToolchainSupport.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/AssetsLibrary.cpp b/tests/AssetsLibrary.cpp
index eafa631..62de78c 100644
--- a/tests/AssetsLibrary.cpp
+++ b/tests/AssetsLibrary.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/AssetsLibrary.h b/tests/AssetsLibrary.h
index 16ebff7..d783b1f 100644
--- a/tests/AssetsLibrary.h
+++ b/tests/AssetsLibrary.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/CL/CLAccessor.h b/tests/CL/CLAccessor.h
index c0aee56..d127def 100644
--- a/tests/CL/CLAccessor.h
+++ b/tests/CL/CLAccessor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/CL/CLArrayAccessor.h b/tests/CL/CLArrayAccessor.h
index 08c86b1..cfc6309 100644
--- a/tests/CL/CLArrayAccessor.h
+++ b/tests/CL/CLArrayAccessor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/CL/CLHOGAccessor.h b/tests/CL/CLHOGAccessor.h
index 0d8751b..2b59495 100644
--- a/tests/CL/CLHOGAccessor.h
+++ b/tests/CL/CLHOGAccessor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/CL/CLLutAccessor.h b/tests/CL/CLLutAccessor.h
index 57047e2..78cd85d 100644
--- a/tests/CL/CLLutAccessor.h
+++ b/tests/CL/CLLutAccessor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/CL/Helper.h b/tests/CL/Helper.h
index 50d6162..e0d584c 100644
--- a/tests/CL/Helper.h
+++ b/tests/CL/Helper.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/GLES_COMPUTE/GCAccessor.h b/tests/GLES_COMPUTE/GCAccessor.h
index 45abd58..2a8733c 100644
--- a/tests/GLES_COMPUTE/GCAccessor.h
+++ b/tests/GLES_COMPUTE/GCAccessor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/GLES_COMPUTE/Helper.h b/tests/GLES_COMPUTE/Helper.h
index 933dd26..fb1679e 100644
--- a/tests/GLES_COMPUTE/Helper.h
+++ b/tests/GLES_COMPUTE/Helper.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/Globals.h b/tests/Globals.h
index db79e33..9ccfd55 100644
--- a/tests/Globals.h
+++ b/tests/Globals.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/IAccessor.h b/tests/IAccessor.h
index 1a602af..c54c00e 100644
--- a/tests/IAccessor.h
+++ b/tests/IAccessor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/IArrayAccessor.h b/tests/IArrayAccessor.h
index 4650769..488aeb0 100644
--- a/tests/IArrayAccessor.h
+++ b/tests/IArrayAccessor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/IHOGAccessor.h b/tests/IHOGAccessor.h
index c974118..f1c137c 100644
--- a/tests/IHOGAccessor.h
+++ b/tests/IHOGAccessor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/ILutAccessor.h b/tests/ILutAccessor.h
index 403717d..39fa202 100644
--- a/tests/ILutAccessor.h
+++ b/tests/ILutAccessor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/NEON/Accessor.h b/tests/NEON/Accessor.h
index 9e031f4..6dc09e3 100644
--- a/tests/NEON/Accessor.h
+++ b/tests/NEON/Accessor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/NEON/ArrayAccessor.h b/tests/NEON/ArrayAccessor.h
index 80f5a58..8de67ed 100644
--- a/tests/NEON/ArrayAccessor.h
+++ b/tests/NEON/ArrayAccessor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/NEON/HOGAccessor.h b/tests/NEON/HOGAccessor.h
index 11f3f79..735abb0 100644
--- a/tests/NEON/HOGAccessor.h
+++ b/tests/NEON/HOGAccessor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/NEON/Helper.h b/tests/NEON/Helper.h
index 18e400d..d1ae37e 100644
--- a/tests/NEON/Helper.h
+++ b/tests/NEON/Helper.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/NEON/LutAccessor.h b/tests/NEON/LutAccessor.h
index e399d5a..5204d06 100644
--- a/tests/NEON/LutAccessor.h
+++ b/tests/NEON/LutAccessor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/PaddingCalculator.h b/tests/PaddingCalculator.h
index c282e8c..72c9236 100644
--- a/tests/PaddingCalculator.h
+++ b/tests/PaddingCalculator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/RawLutAccessor.h b/tests/RawLutAccessor.h
index b797ee9..4318fb2 100644
--- a/tests/RawLutAccessor.h
+++ b/tests/RawLutAccessor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/RawTensor.cpp b/tests/RawTensor.cpp
index ce2510f..a32886e 100644
--- a/tests/RawTensor.cpp
+++ b/tests/RawTensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/RawTensor.h b/tests/RawTensor.h
index 48b30ef..4aac0e2 100644
--- a/tests/RawTensor.h
+++ b/tests/RawTensor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/SConscript b/tests/SConscript
index ae53375..73f5ad7 100644
--- a/tests/SConscript
+++ b/tests/SConscript
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2019 ARM Limited.
+# Copyright (c) 2017-2019 Arm Limited.
 #
 # SPDX-License-Identifier: MIT
 #
diff --git a/tests/SimpleTensor.h b/tests/SimpleTensor.h
index 07474ff..82a5352 100644
--- a/tests/SimpleTensor.h
+++ b/tests/SimpleTensor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/SimpleTensorAccessor.h b/tests/SimpleTensorAccessor.h
index 16f88cc..606a94f 100644
--- a/tests/SimpleTensorAccessor.h
+++ b/tests/SimpleTensorAccessor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/SimpleTensorPrinter.h b/tests/SimpleTensorPrinter.h
index 905a156..6c1506b 100644
--- a/tests/SimpleTensorPrinter.h
+++ b/tests/SimpleTensorPrinter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/TensorCache.h b/tests/TensorCache.h
index aa06b44..6f97946 100644
--- a/tests/TensorCache.h
+++ b/tests/TensorCache.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/TypePrinter.h b/tests/TypePrinter.h
index 67d789f..612360d 100644
--- a/tests/TypePrinter.h
+++ b/tests/TypePrinter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/TypeReader.h b/tests/TypeReader.h
index 8f7efd6..92f1b14 100644
--- a/tests/TypeReader.h
+++ b/tests/TypeReader.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/Types.h b/tests/Types.h
index 17858e4..c8e9a75 100644
--- a/tests/Types.h
+++ b/tests/Types.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/Utils.h b/tests/Utils.h
index 1cc3cb8..81fd253 100644
--- a/tests/Utils.h
+++ b/tests/Utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/benchmark/CL/Scale.cpp b/tests/benchmark/CL/Scale.cpp
index 0b219b4..58727ed 100644
--- a/tests/benchmark/CL/Scale.cpp
+++ b/tests/benchmark/CL/Scale.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/benchmark/GLES_COMPUTE/Scale.cpp b/tests/benchmark/GLES_COMPUTE/Scale.cpp
index b34056a..b3b2735 100644
--- a/tests/benchmark/GLES_COMPUTE/Scale.cpp
+++ b/tests/benchmark/GLES_COMPUTE/Scale.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/benchmark/NEON/Scale.cpp b/tests/benchmark/NEON/Scale.cpp
index 3a6ebcd..7da1b89 100644
--- a/tests/benchmark/NEON/Scale.cpp
+++ b/tests/benchmark/NEON/Scale.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/benchmark/fixtures/ScaleFixture.h b/tests/benchmark/fixtures/ScaleFixture.h
index b2fbd9c..1fea66f 100644
--- a/tests/benchmark/fixtures/ScaleFixture.h
+++ b/tests/benchmark/fixtures/ScaleFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -75,7 +75,7 @@
         dst = create_tensor<TensorType>(shape_scaled, data_type);
 
         // Create and configure function
-        scale_func.configure(&src, &dst, policy, border_mode, constant_border_value, sampling_policy);
+        scale_func.configure(&src, &dst, ScaleKernelInfo{ policy, border_mode, constant_border_value, sampling_policy });
 
         // Allocate tensors
         src.allocator()->allocate();
diff --git a/tests/benchmark/fixtures/ScaleLayerFixture.h b/tests/benchmark/fixtures/ScaleLayerFixture.h
index 10568ea..f69b918 100644
--- a/tests/benchmark/fixtures/ScaleLayerFixture.h
+++ b/tests/benchmark/fixtures/ScaleLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,7 +61,7 @@
         shape_scaled.set(1, shape[1] * scale_y);
         dst = create_tensor<TensorType>(shape_scaled, data_type);
 
-        scale_layer.configure(&src, &dst, policy, border_mode, constant_border_value, sampling_policy);
+        scale_layer.configure(&src, &dst, ScaleKernelInfo{ policy, border_mode, constant_border_value, sampling_policy });
 
         ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
         ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
diff --git a/tests/benchmark_examples/RunExample.cpp b/tests/benchmark_examples/RunExample.cpp
index 613e985..925daaf 100644
--- a/tests/benchmark_examples/RunExample.cpp
+++ b/tests/benchmark_examples/RunExample.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/ActivationFunctionsDataset.h b/tests/datasets/ActivationFunctionsDataset.h
index 7854875..1f3313c 100644
--- a/tests/datasets/ActivationFunctionsDataset.h
+++ b/tests/datasets/ActivationFunctionsDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/AlexNetGEMMDataset.h b/tests/datasets/AlexNetGEMMDataset.h
index ac47fb6..cdbcb83 100644
--- a/tests/datasets/AlexNetGEMMDataset.h
+++ b/tests/datasets/AlexNetGEMMDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017, 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/BatchNormalizationLayerDataset.h b/tests/datasets/BatchNormalizationLayerDataset.h
index fceff1e..02e050c 100644
--- a/tests/datasets/BatchNormalizationLayerDataset.h
+++ b/tests/datasets/BatchNormalizationLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/BatchToSpaceDataset.h b/tests/datasets/BatchToSpaceDataset.h
index 7449392..1edd457 100644
--- a/tests/datasets/BatchToSpaceDataset.h
+++ b/tests/datasets/BatchToSpaceDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/BorderModeDataset.h b/tests/datasets/BorderModeDataset.h
index 45329a3..84a7a4c 100644
--- a/tests/datasets/BorderModeDataset.h
+++ b/tests/datasets/BorderModeDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/ChannelShuffleLayerDataset.h b/tests/datasets/ChannelShuffleLayerDataset.h
index a475139..afab893 100644
--- a/tests/datasets/ChannelShuffleLayerDataset.h
+++ b/tests/datasets/ChannelShuffleLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/Col2ImLayerDataset.h b/tests/datasets/Col2ImLayerDataset.h
index b39cedb..50e64e9 100644
--- a/tests/datasets/Col2ImLayerDataset.h
+++ b/tests/datasets/Col2ImLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/ComparisonOperationsDataset.h b/tests/datasets/ComparisonOperationsDataset.h
index 845f4de..d015ee0 100644
--- a/tests/datasets/ComparisonOperationsDataset.h
+++ b/tests/datasets/ComparisonOperationsDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/ConvertPolicyDataset.h b/tests/datasets/ConvertPolicyDataset.h
index 3b95d4a..bb30203 100644
--- a/tests/datasets/ConvertPolicyDataset.h
+++ b/tests/datasets/ConvertPolicyDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/ConvolutionLayerDataset.h b/tests/datasets/ConvolutionLayerDataset.h
index 2981994..075f7f0 100644
--- a/tests/datasets/ConvolutionLayerDataset.h
+++ b/tests/datasets/ConvolutionLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/CropResizeDataset.h b/tests/datasets/CropResizeDataset.h
index 8cee094..14da8bd 100644
--- a/tests/datasets/CropResizeDataset.h
+++ b/tests/datasets/CropResizeDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/DatatypeDataset.h b/tests/datasets/DatatypeDataset.h
index cc79104..4cce7bb 100644
--- a/tests/datasets/DatatypeDataset.h
+++ b/tests/datasets/DatatypeDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/DepthToSpaceDataset.h b/tests/datasets/DepthToSpaceDataset.h
index 06ee5ba..27cb6ad 100644
--- a/tests/datasets/DepthToSpaceDataset.h
+++ b/tests/datasets/DepthToSpaceDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/DepthwiseConvolutionLayerDataset.h b/tests/datasets/DepthwiseConvolutionLayerDataset.h
index 014207e..5d516b5 100644
--- a/tests/datasets/DepthwiseConvolutionLayerDataset.h
+++ b/tests/datasets/DepthwiseConvolutionLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/DilatedConvolutionLayerDataset.h b/tests/datasets/DilatedConvolutionLayerDataset.h
index 3bbf282..fd3683c 100644
--- a/tests/datasets/DilatedConvolutionLayerDataset.h
+++ b/tests/datasets/DilatedConvolutionLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/DilatedDepthwiseConvolutionLayerDataset.h b/tests/datasets/DilatedDepthwiseConvolutionLayerDataset.h
index 8f4df3e..38762f3 100644
--- a/tests/datasets/DilatedDepthwiseConvolutionLayerDataset.h
+++ b/tests/datasets/DilatedDepthwiseConvolutionLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/DirectConvolutionLayerDataset.h b/tests/datasets/DirectConvolutionLayerDataset.h
index 1091f9e..0dc5f30 100644
--- a/tests/datasets/DirectConvolutionLayerDataset.h
+++ b/tests/datasets/DirectConvolutionLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/FullyConnectedLayerDataset.h b/tests/datasets/FullyConnectedLayerDataset.h
index 06f74ec..2e134b4 100644
--- a/tests/datasets/FullyConnectedLayerDataset.h
+++ b/tests/datasets/FullyConnectedLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/GEMMDataset.h b/tests/datasets/GEMMDataset.h
index 9491582..f18397e 100644
--- a/tests/datasets/GEMMDataset.h
+++ b/tests/datasets/GEMMDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/GEMMLowpDataset.h b/tests/datasets/GEMMLowpDataset.h
index 062c05b..608dd82 100644
--- a/tests/datasets/GEMMLowpDataset.h
+++ b/tests/datasets/GEMMLowpDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/GEMMLowpFusedOffsetOutputDataset.h b/tests/datasets/GEMMLowpFusedOffsetOutputDataset.h
index 3b4e81a..fe3ca25 100644
--- a/tests/datasets/GEMMLowpFusedOffsetOutputDataset.h
+++ b/tests/datasets/GEMMLowpFusedOffsetOutputDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/GatherDataset.h b/tests/datasets/GatherDataset.h
index 29f2ccc..29a99d5 100644
--- a/tests/datasets/GatherDataset.h
+++ b/tests/datasets/GatherDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/GoogleNetGEMMDataset.h b/tests/datasets/GoogleNetGEMMDataset.h
index 50e9b45..f5f09ae 100644
--- a/tests/datasets/GoogleNetGEMMDataset.h
+++ b/tests/datasets/GoogleNetGEMMDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/GradientDimensionDataset.h b/tests/datasets/GradientDimensionDataset.h
index b85d43f..3bbf00a 100644
--- a/tests/datasets/GradientDimensionDataset.h
+++ b/tests/datasets/GradientDimensionDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/HOGDescriptorDataset.h b/tests/datasets/HOGDescriptorDataset.h
index 73c6494..92d3506 100644
--- a/tests/datasets/HOGDescriptorDataset.h
+++ b/tests/datasets/HOGDescriptorDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017, 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/HOGMultiDetectionDataset.h b/tests/datasets/HOGMultiDetectionDataset.h
index eb493d0..042617e 100644
--- a/tests/datasets/HOGMultiDetectionDataset.h
+++ b/tests/datasets/HOGMultiDetectionDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/ImageFileDatasets.h b/tests/datasets/ImageFileDatasets.h
index 2494f24..dc7975f 100644
--- a/tests/datasets/ImageFileDatasets.h
+++ b/tests/datasets/ImageFileDatasets.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/InterpolationPolicyDataset.h b/tests/datasets/InterpolationPolicyDataset.h
index bb14b85..4d91e4c 100644
--- a/tests/datasets/InterpolationPolicyDataset.h
+++ b/tests/datasets/InterpolationPolicyDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/LSTMLayerDataset.h b/tests/datasets/LSTMLayerDataset.h
index c21f320..6ea2222 100644
--- a/tests/datasets/LSTMLayerDataset.h
+++ b/tests/datasets/LSTMLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/LargeConvolutionLayerDataset.h b/tests/datasets/LargeConvolutionLayerDataset.h
index 20a73b8..1cffc9a 100644
--- a/tests/datasets/LargeConvolutionLayerDataset.h
+++ b/tests/datasets/LargeConvolutionLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/LargeGEMMDataset.h b/tests/datasets/LargeGEMMDataset.h
index 0ca0b04..6cdff7f 100644
--- a/tests/datasets/LargeGEMMDataset.h
+++ b/tests/datasets/LargeGEMMDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/LargeGEMMLowpDataset.h b/tests/datasets/LargeGEMMLowpDataset.h
index 65cb742..6c201c5 100644
--- a/tests/datasets/LargeGEMMLowpDataset.h
+++ b/tests/datasets/LargeGEMMLowpDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/LocallyConnectedDataset.h b/tests/datasets/LocallyConnectedDataset.h
index cc2fa88..5d2017d 100644
--- a/tests/datasets/LocallyConnectedDataset.h
+++ b/tests/datasets/LocallyConnectedDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/MatrixMultiplyGEMMDataset.h b/tests/datasets/MatrixMultiplyGEMMDataset.h
index fd2a3d6..66adfa6 100644
--- a/tests/datasets/MatrixMultiplyGEMMDataset.h
+++ b/tests/datasets/MatrixMultiplyGEMMDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/MatrixPatternDataset.h b/tests/datasets/MatrixPatternDataset.h
index ebfdeac..e422051 100644
--- a/tests/datasets/MatrixPatternDataset.h
+++ b/tests/datasets/MatrixPatternDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/NonLinearFilterFunctionDataset.h b/tests/datasets/NonLinearFilterFunctionDataset.h
index 458b343..1253590 100644
--- a/tests/datasets/NonLinearFilterFunctionDataset.h
+++ b/tests/datasets/NonLinearFilterFunctionDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/NormalizationTypesDataset.h b/tests/datasets/NormalizationTypesDataset.h
index 58e6f67..5e777c5 100644
--- a/tests/datasets/NormalizationTypesDataset.h
+++ b/tests/datasets/NormalizationTypesDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/NormalizePlanarYUVLayerDataset.h b/tests/datasets/NormalizePlanarYUVLayerDataset.h
index 1a97e68..1d7b320 100644
--- a/tests/datasets/NormalizePlanarYUVLayerDataset.h
+++ b/tests/datasets/NormalizePlanarYUVLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/OpticalFlowDataset.h b/tests/datasets/OpticalFlowDataset.h
index 00b1487..2ef75af 100644
--- a/tests/datasets/OpticalFlowDataset.h
+++ b/tests/datasets/OpticalFlowDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/PoolingLayerDataset.h b/tests/datasets/PoolingLayerDataset.h
index 943279e..01b2491 100644
--- a/tests/datasets/PoolingLayerDataset.h
+++ b/tests/datasets/PoolingLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/PoolingTypesDataset.h b/tests/datasets/PoolingTypesDataset.h
index 67560f7..3a47a35 100644
--- a/tests/datasets/PoolingTypesDataset.h
+++ b/tests/datasets/PoolingTypesDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/PriorBoxLayerDataset.h b/tests/datasets/PriorBoxLayerDataset.h
index a2392db..b5c8a44 100644
--- a/tests/datasets/PriorBoxLayerDataset.h
+++ b/tests/datasets/PriorBoxLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/RNNLayerDataset.h b/tests/datasets/RNNLayerDataset.h
index 5f42def..5c252f9 100644
--- a/tests/datasets/RNNLayerDataset.h
+++ b/tests/datasets/RNNLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/ROIDataset.h b/tests/datasets/ROIDataset.h
index 9e21ab1..2db8973 100644
--- a/tests/datasets/ROIDataset.h
+++ b/tests/datasets/ROIDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/RandomBatchNormalizationLayerDataset.h b/tests/datasets/RandomBatchNormalizationLayerDataset.h
index 2acce6e..5a49dd7 100644
--- a/tests/datasets/RandomBatchNormalizationLayerDataset.h
+++ b/tests/datasets/RandomBatchNormalizationLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/RandomNormalizePlanarYUVLayerDataset.h b/tests/datasets/RandomNormalizePlanarYUVLayerDataset.h
index 56eb604..d42c682 100644
--- a/tests/datasets/RandomNormalizePlanarYUVLayerDataset.h
+++ b/tests/datasets/RandomNormalizePlanarYUVLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/ReorgLayerDataset.h b/tests/datasets/ReorgLayerDataset.h
index de363e7..34d2701 100644
--- a/tests/datasets/ReorgLayerDataset.h
+++ b/tests/datasets/ReorgLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/ReshapeLayerDataset.h b/tests/datasets/ReshapeLayerDataset.h
index 1a8932f..d1a1667 100644
--- a/tests/datasets/ReshapeLayerDataset.h
+++ b/tests/datasets/ReshapeLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/SamplingPolicyDataset.h b/tests/datasets/SamplingPolicyDataset.h
index 6341f52..e36eb51 100644
--- a/tests/datasets/SamplingPolicyDataset.h
+++ b/tests/datasets/SamplingPolicyDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/ScaleLayerDataset.h b/tests/datasets/ScaleLayerDataset.h
index 8b5fbbe..810ea5c 100644
--- a/tests/datasets/ScaleLayerDataset.h
+++ b/tests/datasets/ScaleLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/ScaleValidationDataset.h b/tests/datasets/ScaleValidationDataset.h
new file mode 100644
index 0000000..217876b
--- /dev/null
+++ b/tests/datasets/ScaleValidationDataset.h
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_SCALE_VALIDATION_DATASET
+#define ARM_COMPUTE_TEST_SCALE_VALIDATION_DATASET
+
+#include "utils/TypePrinter.h"
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "tests/datasets/BorderModeDataset.h"
+#include "tests/datasets/InterpolationPolicyDataset.h"
+#include "tests/datasets/SamplingPolicyDataset.h"
+#include "tests/datasets/ShapeDatasets.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace datasets
+{
+/** Class to generate boundary values for the given template parameters
+ * including shapes with large differences between width and height.
+ * element_per_iteration is the number of elements processed by one iteration
+ * of an implementation. (E.g., if an iteration is based on a 16-byte vector
+ * and size of one element is 1-byte, this value would be 16.).
+ * iterations is the total number of complete iterations we want to test
+ * for the effect of larger shapes.
+ */
+template <uint32_t channel, uint32_t batch, uint32_t element_per_iteration, uint32_t iterations>
+class ScaleShapesBaseDataSet : public ShapeDataset
+{
+    static constexpr auto boundary_minus_one = element_per_iteration * iterations - 1;
+    static constexpr auto boundary_plus_one  = element_per_iteration * iterations + 1;
+    static constexpr auto small_size         = 3;
+
+public:
+    // These tensor shapes are NCHW layout, fixture will convert to NHWC.
+    ScaleShapesBaseDataSet()
+        : ShapeDataset("Shape",
+    {
+        TensorShape{ small_size, boundary_minus_one, channel, batch },
+                     TensorShape{ small_size, boundary_plus_one, channel, batch },
+                     TensorShape{ boundary_minus_one, small_size, channel, batch },
+                     TensorShape{ boundary_plus_one, small_size, channel, batch },
+                     TensorShape{ boundary_minus_one, boundary_plus_one, channel, batch },
+                     TensorShape{ boundary_plus_one, boundary_minus_one, channel, batch },
+    })
+    {
+    }
+};
+
+/** For the single vector, only larger value (+1) than boundary
+ * since smaller value (-1) could cause some invalid shapes like
+ * - invalid zero size
+ * - size 1 which isn't compatible with scale with aligned corners.
+ */
+template <uint32_t channel, uint32_t batch, uint32_t element_per_iteration>
+class ScaleShapesBaseDataSet<channel, batch, element_per_iteration, 1> : public ShapeDataset
+{
+    static constexpr auto small_size        = 3;
+    static constexpr auto boundary_plus_one = element_per_iteration + 1;
+
+public:
+    // These tensor shapes are NCHW layout, fixture will convert to NHWC.
+    ScaleShapesBaseDataSet()
+        : ShapeDataset("Shape",
+    {
+        TensorShape{ small_size, boundary_plus_one, channel, batch },
+                     TensorShape{ boundary_plus_one, small_size, channel, batch },
+    })
+    {
+    }
+};
+
+/** For the shapes smaller than one vector, only pre-defined tiny shapes
+ * are tested (3x2, 2x3) as smaller shapes are more likely to cause
+ * issues and easier to debug.
+ */
+template <uint32_t channel, uint32_t batch, uint32_t element_per_iteration>
+class ScaleShapesBaseDataSet<channel, batch, element_per_iteration, 0> : public ShapeDataset
+{
+    static constexpr auto small_size                 = 3;
+    static constexpr auto zero_vector_boundary_value = 2;
+
+public:
+    // These tensor shapes are NCHW layout, fixture will convert to NHWC.
+    ScaleShapesBaseDataSet()
+        : ShapeDataset("Shape",
+    {
+        TensorShape{ small_size, zero_vector_boundary_value, channel, batch },
+                     TensorShape{ zero_vector_boundary_value, small_size, channel, batch },
+    })
+    {
+    }
+};
+
+/** Interpolation policy test set */
+const auto ScaleInterpolationPolicySet = framework::dataset::make("InterpolationPolicy",
+{
+    InterpolationPolicy::NEAREST_NEIGHBOR,
+    InterpolationPolicy::BILINEAR,
+});
+
+/** Scale data types */
+const auto ScaleDataLayouts = framework::dataset::make("DataLayout",
+{
+    DataLayout::NCHW,
+    DataLayout::NHWC,
+});
+
+/** Sampling policy data set */
+const auto ScaleSamplingPolicySet = combine(datasets::SamplingPolicies(),
+                                            framework::dataset::make("AlignCorners", { false }));
+
+/** Sampling policy data set for Aligned Corners which only allows TOP_LEFT policy.*/
+const auto ScaleAlignCornersSamplingPolicySet = combine(framework::dataset::make("SamplingPolicy",
+{
+    SamplingPolicy::TOP_LEFT,
+}),
+framework::dataset::make("AlignCorners", { true }));
+
+/** Generated shapes: Used by NEON precommit and nightly
+ * - 2D shapes with 0, 1, 2 vector iterations
+ * - 3D shapes with 0, 1 vector iterations
+ * - 4D shapes with 0 vector iterations
+ */
+#define SCALE_SHAPE_DATASET(element_per_iteration)                                                  \
+    concat(concat(concat(concat(concat(ScaleShapesBaseDataSet<1, 1, (element_per_iteration), 0>(),  \
+                                       ScaleShapesBaseDataSet<1, 1, (element_per_iteration), 1>()), \
+                                ScaleShapesBaseDataSet<1, 1, (element_per_iteration), 2>()),        \
+                         ScaleShapesBaseDataSet<3, 1, (element_per_iteration), 0>()),               \
+                  ScaleShapesBaseDataSet<3, 1, (element_per_iteration), 1>()),                      \
+           ScaleShapesBaseDataSet<3, 3, (element_per_iteration), 0>())
+
+// To prevent long precommit time for OpenCL, shape set for OpenCL is separated into below two parts.
+/** Generated shapes for precommits to achieve essential coverage. Used by CL precommit and nightly
+ * - 3D shapes with 1 vector iterations
+ * - 4D shapes with 1 vector iterations
+ */
+#define SCALE_PRECOMMIT_SHAPE_DATASET(element_per_iteration) \
+    concat(ScaleShapesBaseDataSet<3, 1, (element_per_iteration), 1>(), ScaleShapesBaseDataSet<3, 3, (element_per_iteration), 1>())
+
+/** Generated shapes for nightly to cover a wider variety of small shapes. Used by CL nightly
+ * - 2D shapes with 0, 1, 2 vector iterations
+ * - 3D shapes with 0 vector iterations (1 vector iteration is covered by SCALE_PRECOMMIT_SHAPE_DATASET)
+ * - 4D shapes with 0 vector iterations
+ */
+#define SCALE_NIGHTLY_SHAPE_DATASET(element_per_iteration)                                   \
+    concat(concat(concat(concat(ScaleShapesBaseDataSet<1, 1, (element_per_iteration), 0>(),  \
+                                ScaleShapesBaseDataSet<1, 1, (element_per_iteration), 1>()), \
+                         ScaleShapesBaseDataSet<1, 1, (element_per_iteration), 2>()),        \
+                  ScaleShapesBaseDataSet<3, 1, (element_per_iteration), 0>()),               \
+           ScaleShapesBaseDataSet<3, 3, (element_per_iteration), 0>())
+
+/** Generating dataset for non-quantized data types with the given shapes */
+#define ASSEMBLE_DATASET(shape, sampling_policy_set)            \
+    combine(combine(combine(combine((shape), ScaleDataLayouts), \
+                            ScaleInterpolationPolicySet),       \
+                    datasets::BorderModes()),                   \
+            sampling_policy_set)
+
+/** Generating dataset for quantized data types with the given shapes */
+#define ASSEMBLE_QUANTIZED_DATASET(shape, sampling_policy_set, quantization_info_set) \
+    combine(combine(combine(combine(combine(shape,                                    \
+                                            quantization_info_set),                   \
+                                    ScaleDataLayouts),                                \
+                            ScaleInterpolationPolicySet),                             \
+                    datasets::BorderModes()),                                         \
+            sampling_policy_set)
+
+} // namespace datasets
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_SCALE_VALIDATION_DATASET */
\ No newline at end of file
diff --git a/tests/datasets/ShapeDatasets.h b/tests/datasets/ShapeDatasets.h
index 087342d..ccd0756 100644
--- a/tests/datasets/ShapeDatasets.h
+++ b/tests/datasets/ShapeDatasets.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/SliceOperationsDataset.h b/tests/datasets/SliceOperationsDataset.h
index e891419..be0b808 100644
--- a/tests/datasets/SliceOperationsDataset.h
+++ b/tests/datasets/SliceOperationsDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/SmallConvolutionLayerDataset.h b/tests/datasets/SmallConvolutionLayerDataset.h
index e426b28..66640dd 100644
--- a/tests/datasets/SmallConvolutionLayerDataset.h
+++ b/tests/datasets/SmallConvolutionLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/SmallGEMMDataset.h b/tests/datasets/SmallGEMMDataset.h
index 45d1a1e..7d2b42a 100644
--- a/tests/datasets/SmallGEMMDataset.h
+++ b/tests/datasets/SmallGEMMDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/SmallGEMMLowpDataset.h b/tests/datasets/SmallGEMMLowpDataset.h
index 40f0c71..f16e3fa 100644
--- a/tests/datasets/SmallGEMMLowpDataset.h
+++ b/tests/datasets/SmallGEMMLowpDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/SpaceToBatchDataset.h b/tests/datasets/SpaceToBatchDataset.h
index 37b0f2e..650dc90 100644
--- a/tests/datasets/SpaceToBatchDataset.h
+++ b/tests/datasets/SpaceToBatchDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/SpaceToDepthDataset.h b/tests/datasets/SpaceToDepthDataset.h
index eb81c9a..414dce0 100644
--- a/tests/datasets/SpaceToDepthDataset.h
+++ b/tests/datasets/SpaceToDepthDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/SplitDataset.h b/tests/datasets/SplitDataset.h
index 3d4c289..a1eb4d7 100644
--- a/tests/datasets/SplitDataset.h
+++ b/tests/datasets/SplitDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/ThresholdDataset.h b/tests/datasets/ThresholdDataset.h
index 4ae2017..a082ab3 100644
--- a/tests/datasets/ThresholdDataset.h
+++ b/tests/datasets/ThresholdDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/TinyConvolutionLayerDataset.h b/tests/datasets/TinyConvolutionLayerDataset.h
index 178ebd3..33542cc 100644
--- a/tests/datasets/TinyConvolutionLayerDataset.h
+++ b/tests/datasets/TinyConvolutionLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/TinyGEMMDataset.h b/tests/datasets/TinyGEMMDataset.h
index 83af8f0..6cb7efa 100644
--- a/tests/datasets/TinyGEMMDataset.h
+++ b/tests/datasets/TinyGEMMDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/WinogradInputTransformDataset.h b/tests/datasets/WinogradInputTransformDataset.h
index ac9baba..fc5593e 100644
--- a/tests/datasets/WinogradInputTransformDataset.h
+++ b/tests/datasets/WinogradInputTransformDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/WinogradOutputTransformDataset.h b/tests/datasets/WinogradOutputTransformDataset.h
index d15a16e..e666db4 100644
--- a/tests/datasets/WinogradOutputTransformDataset.h
+++ b/tests/datasets/WinogradOutputTransformDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/alexnet/AlexNetActivationLayerDataset.h b/tests/datasets/system_tests/alexnet/AlexNetActivationLayerDataset.h
index 74c5fda..16cd1f3 100644
--- a/tests/datasets/system_tests/alexnet/AlexNetActivationLayerDataset.h
+++ b/tests/datasets/system_tests/alexnet/AlexNetActivationLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/alexnet/AlexNetConvolutionLayerDataset.h b/tests/datasets/system_tests/alexnet/AlexNetConvolutionLayerDataset.h
index 034e1bb..dc31f60 100644
--- a/tests/datasets/system_tests/alexnet/AlexNetConvolutionLayerDataset.h
+++ b/tests/datasets/system_tests/alexnet/AlexNetConvolutionLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/alexnet/AlexNetFullyConnectedLayerDataset.h b/tests/datasets/system_tests/alexnet/AlexNetFullyConnectedLayerDataset.h
index 3d43e6f..b4c367b 100644
--- a/tests/datasets/system_tests/alexnet/AlexNetFullyConnectedLayerDataset.h
+++ b/tests/datasets/system_tests/alexnet/AlexNetFullyConnectedLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/alexnet/AlexNetNormalizationLayerDataset.h b/tests/datasets/system_tests/alexnet/AlexNetNormalizationLayerDataset.h
index 6f68fd7..62c9855 100644
--- a/tests/datasets/system_tests/alexnet/AlexNetNormalizationLayerDataset.h
+++ b/tests/datasets/system_tests/alexnet/AlexNetNormalizationLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/alexnet/AlexNetPoolingLayerDataset.h b/tests/datasets/system_tests/alexnet/AlexNetPoolingLayerDataset.h
index 241f256..4874ce9 100644
--- a/tests/datasets/system_tests/alexnet/AlexNetPoolingLayerDataset.h
+++ b/tests/datasets/system_tests/alexnet/AlexNetPoolingLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1ActivationLayerDataset.h b/tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1ActivationLayerDataset.h
index 76087fa..cb72b25 100644
--- a/tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1ActivationLayerDataset.h
+++ b/tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1ActivationLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1ConvolutionLayerDataset.h b/tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1ConvolutionLayerDataset.h
index 191452c..5588c09 100644
--- a/tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1ConvolutionLayerDataset.h
+++ b/tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1ConvolutionLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1FullyConnectedLayerDataset.h b/tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1FullyConnectedLayerDataset.h
index cde9ae6..f968e17 100644
--- a/tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1FullyConnectedLayerDataset.h
+++ b/tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1FullyConnectedLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1GEMMDataset.h b/tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1GEMMDataset.h
index 3b5c9ac..6292648 100644
--- a/tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1GEMMDataset.h
+++ b/tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1GEMMDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1NormalizationLayerDataset.h b/tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1NormalizationLayerDataset.h
index eb6903f..c572690 100644
--- a/tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1NormalizationLayerDataset.h
+++ b/tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1NormalizationLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1PoolingLayerDataset.h b/tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1PoolingLayerDataset.h
index 652ac6b..8a2dc0c 100644
--- a/tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1PoolingLayerDataset.h
+++ b/tests/datasets/system_tests/googlenet/inceptionv1/GoogLeNetInceptionV1PoolingLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/googlenet/inceptionv4/GoogLeNetInceptionV4ActivationLayerDataset.h b/tests/datasets/system_tests/googlenet/inceptionv4/GoogLeNetInceptionV4ActivationLayerDataset.h
index 395b1f9..2c082eb 100644
--- a/tests/datasets/system_tests/googlenet/inceptionv4/GoogLeNetInceptionV4ActivationLayerDataset.h
+++ b/tests/datasets/system_tests/googlenet/inceptionv4/GoogLeNetInceptionV4ActivationLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/googlenet/inceptionv4/GoogLeNetInceptionV4BatchNormalizationLayerDataset.h b/tests/datasets/system_tests/googlenet/inceptionv4/GoogLeNetInceptionV4BatchNormalizationLayerDataset.h
index 3a22c78..dfc009b 100644
--- a/tests/datasets/system_tests/googlenet/inceptionv4/GoogLeNetInceptionV4BatchNormalizationLayerDataset.h
+++ b/tests/datasets/system_tests/googlenet/inceptionv4/GoogLeNetInceptionV4BatchNormalizationLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/googlenet/inceptionv4/GoogLeNetInceptionV4ConvolutionLayerDataset.h b/tests/datasets/system_tests/googlenet/inceptionv4/GoogLeNetInceptionV4ConvolutionLayerDataset.h
index 03a8629..6480448 100644
--- a/tests/datasets/system_tests/googlenet/inceptionv4/GoogLeNetInceptionV4ConvolutionLayerDataset.h
+++ b/tests/datasets/system_tests/googlenet/inceptionv4/GoogLeNetInceptionV4ConvolutionLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/googlenet/inceptionv4/GoogLeNetInceptionV4FullyConnectedLayerDataset.h b/tests/datasets/system_tests/googlenet/inceptionv4/GoogLeNetInceptionV4FullyConnectedLayerDataset.h
index 5a0af2b..290447d 100644
--- a/tests/datasets/system_tests/googlenet/inceptionv4/GoogLeNetInceptionV4FullyConnectedLayerDataset.h
+++ b/tests/datasets/system_tests/googlenet/inceptionv4/GoogLeNetInceptionV4FullyConnectedLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/googlenet/inceptionv4/GoogLeNetInceptionV4NormalizePlanarYUVLayerDataset.h b/tests/datasets/system_tests/googlenet/inceptionv4/GoogLeNetInceptionV4NormalizePlanarYUVLayerDataset.h
index e0da484..cb8ebe3 100644
--- a/tests/datasets/system_tests/googlenet/inceptionv4/GoogLeNetInceptionV4NormalizePlanarYUVLayerDataset.h
+++ b/tests/datasets/system_tests/googlenet/inceptionv4/GoogLeNetInceptionV4NormalizePlanarYUVLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/googlenet/inceptionv4/GoogLeNetInceptionV4PoolingLayerDataset.h b/tests/datasets/system_tests/googlenet/inceptionv4/GoogLeNetInceptionV4PoolingLayerDataset.h
index 12eefdd..d639b9a 100644
--- a/tests/datasets/system_tests/googlenet/inceptionv4/GoogLeNetInceptionV4PoolingLayerDataset.h
+++ b/tests/datasets/system_tests/googlenet/inceptionv4/GoogLeNetInceptionV4PoolingLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/lenet5/LeNet5ActivationLayerDataset.h b/tests/datasets/system_tests/lenet5/LeNet5ActivationLayerDataset.h
index 021bfee..b587b00 100644
--- a/tests/datasets/system_tests/lenet5/LeNet5ActivationLayerDataset.h
+++ b/tests/datasets/system_tests/lenet5/LeNet5ActivationLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/lenet5/LeNet5ConvolutionLayerDataset.h b/tests/datasets/system_tests/lenet5/LeNet5ConvolutionLayerDataset.h
index 3f28627..06d9fc7 100644
--- a/tests/datasets/system_tests/lenet5/LeNet5ConvolutionLayerDataset.h
+++ b/tests/datasets/system_tests/lenet5/LeNet5ConvolutionLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/lenet5/LeNet5FullyConnectedLayerDataset.h b/tests/datasets/system_tests/lenet5/LeNet5FullyConnectedLayerDataset.h
index 74e8d2c..0724558 100644
--- a/tests/datasets/system_tests/lenet5/LeNet5FullyConnectedLayerDataset.h
+++ b/tests/datasets/system_tests/lenet5/LeNet5FullyConnectedLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/lenet5/LeNet5PoolingLayerDataset.h b/tests/datasets/system_tests/lenet5/LeNet5PoolingLayerDataset.h
index 5740dc3..713fa9e 100644
--- a/tests/datasets/system_tests/lenet5/LeNet5PoolingLayerDataset.h
+++ b/tests/datasets/system_tests/lenet5/LeNet5PoolingLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/mobilenet/MobileNetActivationLayerDataset.h b/tests/datasets/system_tests/mobilenet/MobileNetActivationLayerDataset.h
index 64cd918..90cc755 100644
--- a/tests/datasets/system_tests/mobilenet/MobileNetActivationLayerDataset.h
+++ b/tests/datasets/system_tests/mobilenet/MobileNetActivationLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/mobilenet/MobileNetBatchNormalizationLayerDataset.h b/tests/datasets/system_tests/mobilenet/MobileNetBatchNormalizationLayerDataset.h
index d09ff02..89945d6 100644
--- a/tests/datasets/system_tests/mobilenet/MobileNetBatchNormalizationLayerDataset.h
+++ b/tests/datasets/system_tests/mobilenet/MobileNetBatchNormalizationLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/mobilenet/MobileNetConvolutionLayerDataset.h b/tests/datasets/system_tests/mobilenet/MobileNetConvolutionLayerDataset.h
index b108fe6..cd229e8 100644
--- a/tests/datasets/system_tests/mobilenet/MobileNetConvolutionLayerDataset.h
+++ b/tests/datasets/system_tests/mobilenet/MobileNetConvolutionLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/mobilenet/MobileNetDepthwiseConvolutionLayerDataset.h b/tests/datasets/system_tests/mobilenet/MobileNetDepthwiseConvolutionLayerDataset.h
index 25ac1c1..8ef91ce 100644
--- a/tests/datasets/system_tests/mobilenet/MobileNetDepthwiseConvolutionLayerDataset.h
+++ b/tests/datasets/system_tests/mobilenet/MobileNetDepthwiseConvolutionLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/resnet12/ResNet12ConvolutionLayerDataset.h b/tests/datasets/system_tests/resnet12/ResNet12ConvolutionLayerDataset.h
index b960dce..9c595bc 100644
--- a/tests/datasets/system_tests/resnet12/ResNet12ConvolutionLayerDataset.h
+++ b/tests/datasets/system_tests/resnet12/ResNet12ConvolutionLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/squeezenet/SqueezeNetActivationLayerDataset.h b/tests/datasets/system_tests/squeezenet/SqueezeNetActivationLayerDataset.h
index 7f4bf4d..6fdd791 100644
--- a/tests/datasets/system_tests/squeezenet/SqueezeNetActivationLayerDataset.h
+++ b/tests/datasets/system_tests/squeezenet/SqueezeNetActivationLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/squeezenet/SqueezeNetConvolutionLayerDataset.h b/tests/datasets/system_tests/squeezenet/SqueezeNetConvolutionLayerDataset.h
index f98d90a..867d199 100644
--- a/tests/datasets/system_tests/squeezenet/SqueezeNetConvolutionLayerDataset.h
+++ b/tests/datasets/system_tests/squeezenet/SqueezeNetConvolutionLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/squeezenet/SqueezeNetPoolingLayerDataset.h b/tests/datasets/system_tests/squeezenet/SqueezeNetPoolingLayerDataset.h
index b1283cd..54b01bd 100644
--- a/tests/datasets/system_tests/squeezenet/SqueezeNetPoolingLayerDataset.h
+++ b/tests/datasets/system_tests/squeezenet/SqueezeNetPoolingLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/vgg/vgg16/VGG16ActivationLayerDataset.h b/tests/datasets/system_tests/vgg/vgg16/VGG16ActivationLayerDataset.h
index 93de996..0b2433f 100644
--- a/tests/datasets/system_tests/vgg/vgg16/VGG16ActivationLayerDataset.h
+++ b/tests/datasets/system_tests/vgg/vgg16/VGG16ActivationLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/vgg/vgg16/VGG16ConvolutionLayerDataset.h b/tests/datasets/system_tests/vgg/vgg16/VGG16ConvolutionLayerDataset.h
index fe31da2..234beb1 100644
--- a/tests/datasets/system_tests/vgg/vgg16/VGG16ConvolutionLayerDataset.h
+++ b/tests/datasets/system_tests/vgg/vgg16/VGG16ConvolutionLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/vgg/vgg16/VGG16FullyConnectedLayerDataset.h b/tests/datasets/system_tests/vgg/vgg16/VGG16FullyConnectedLayerDataset.h
index 3b40b34..16213d0 100644
--- a/tests/datasets/system_tests/vgg/vgg16/VGG16FullyConnectedLayerDataset.h
+++ b/tests/datasets/system_tests/vgg/vgg16/VGG16FullyConnectedLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/vgg/vgg16/VGG16PoolingLayerDataset.h b/tests/datasets/system_tests/vgg/vgg16/VGG16PoolingLayerDataset.h
index 35221dd..a1ebfaa 100644
--- a/tests/datasets/system_tests/vgg/vgg16/VGG16PoolingLayerDataset.h
+++ b/tests/datasets/system_tests/vgg/vgg16/VGG16PoolingLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/yolo/v2/YOLOV2ActivationLayerDataset.h b/tests/datasets/system_tests/yolo/v2/YOLOV2ActivationLayerDataset.h
index f30069e..ec6f470 100644
--- a/tests/datasets/system_tests/yolo/v2/YOLOV2ActivationLayerDataset.h
+++ b/tests/datasets/system_tests/yolo/v2/YOLOV2ActivationLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/yolo/v2/YOLOV2BatchNormalizationLayerDataset.h b/tests/datasets/system_tests/yolo/v2/YOLOV2BatchNormalizationLayerDataset.h
index 4d2cba8..9997ffc 100644
--- a/tests/datasets/system_tests/yolo/v2/YOLOV2BatchNormalizationLayerDataset.h
+++ b/tests/datasets/system_tests/yolo/v2/YOLOV2BatchNormalizationLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/yolo/v2/YOLOV2ConvolutionLayerDataset.h b/tests/datasets/system_tests/yolo/v2/YOLOV2ConvolutionLayerDataset.h
index 5e07ff8..67d57bf 100644
--- a/tests/datasets/system_tests/yolo/v2/YOLOV2ConvolutionLayerDataset.h
+++ b/tests/datasets/system_tests/yolo/v2/YOLOV2ConvolutionLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/datasets/system_tests/yolo/v2/YOLOV2PoolingLayerDataset.h b/tests/datasets/system_tests/yolo/v2/YOLOV2PoolingLayerDataset.h
index ddbad6b..88cac5e 100644
--- a/tests/datasets/system_tests/yolo/v2/YOLOV2PoolingLayerDataset.h
+++ b/tests/datasets/system_tests/yolo/v2/YOLOV2PoolingLayerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/Asserts.h b/tests/framework/Asserts.h
index 9d6d4fa..b8a8fe0 100644
--- a/tests/framework/Asserts.h
+++ b/tests/framework/Asserts.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -135,6 +135,47 @@
         arm_compute::test::framework::Framework::get().clear_test_info();                                                                     \
     } while(false)
 
+#define ARM_COMPUTE_EXPECT_NO_THROW(X, LEVEL)                                                                                                 \
+    do                                                                                                                                        \
+    {                                                                                                                                         \
+        try                                                                                                                                   \
+        {                                                                                                                                     \
+            const auto &x = X;                                                                                                                \
+            (void)x;                                                                                                                          \
+        }                                                                                                                                     \
+        catch(...)                                                                                                                            \
+        {                                                                                                                                     \
+            std::stringstream msg;                                                                                                            \
+            msg << "Expectation '" #X "' to not throw failed.\n";                                                                             \
+            arm_compute::test::framework::Framework::get().print_test_info(msg);                                                              \
+            arm_compute::test::framework::Framework::get().log_failed_expectation(arm_compute::test::framework::TestError(msg.str(), LEVEL)); \
+        }                                                                                                                                     \
+        arm_compute::test::framework::Framework::get().clear_test_info();                                                                     \
+    } while(false)
+
+#define ARM_COMPUTE_EXPECT_THROW(X, LEVEL)                                                                                                    \
+    do                                                                                                                                        \
+    {                                                                                                                                         \
+        bool exception_caught = false;                                                                                                        \
+        try                                                                                                                                   \
+        {                                                                                                                                     \
+            const auto &x = X;                                                                                                                \
+            (void)x;                                                                                                                          \
+        }                                                                                                                                     \
+        catch(...)                                                                                                                            \
+        {                                                                                                                                     \
+            exception_caught = true;                                                                                                          \
+        }                                                                                                                                     \
+        if(!exception_caught)                                                                                                                 \
+        {                                                                                                                                     \
+            std::stringstream msg;                                                                                                            \
+            msg << "Expectation '" #X "' to throw failed.\n";                                                                                 \
+            arm_compute::test::framework::Framework::get().print_test_info(msg);                                                              \
+            arm_compute::test::framework::Framework::get().log_failed_expectation(arm_compute::test::framework::TestError(msg.str(), LEVEL)); \
+        }                                                                                                                                     \
+        arm_compute::test::framework::Framework::get().clear_test_info();                                                                     \
+    } while(false)
+
 #define ARM_COMPUTE_ASSERT_FAIL(MSG)                                                                              \
     do                                                                                                            \
     {                                                                                                             \
diff --git a/tests/framework/DatasetModes.cpp b/tests/framework/DatasetModes.cpp
index 0224bd8..5acc792 100644
--- a/tests/framework/DatasetModes.cpp
+++ b/tests/framework/DatasetModes.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/DatasetModes.h b/tests/framework/DatasetModes.h
index c970550..19a3cb4 100644
--- a/tests/framework/DatasetModes.h
+++ b/tests/framework/DatasetModes.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/Exceptions.cpp b/tests/framework/Exceptions.cpp
index 0ca86a8..5493612 100644
--- a/tests/framework/Exceptions.cpp
+++ b/tests/framework/Exceptions.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/Exceptions.h b/tests/framework/Exceptions.h
index 687305b..75de683 100644
--- a/tests/framework/Exceptions.h
+++ b/tests/framework/Exceptions.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/Fixture.h b/tests/framework/Fixture.h
index 916dcc7..7692bca 100644
--- a/tests/framework/Fixture.h
+++ b/tests/framework/Fixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/Framework.cpp b/tests/framework/Framework.cpp
index dff280d..8e836ee 100644
--- a/tests/framework/Framework.cpp
+++ b/tests/framework/Framework.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/Framework.h b/tests/framework/Framework.h
index 11dedfe..01ab373 100644
--- a/tests/framework/Framework.h
+++ b/tests/framework/Framework.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/Macros.h b/tests/framework/Macros.h
index 275ea88..a67a759 100644
--- a/tests/framework/Macros.h
+++ b/tests/framework/Macros.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/ParametersLibrary.cpp b/tests/framework/ParametersLibrary.cpp
index 4af4179..a6953ea 100644
--- a/tests/framework/ParametersLibrary.cpp
+++ b/tests/framework/ParametersLibrary.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/ParametersLibrary.h b/tests/framework/ParametersLibrary.h
index 9b325be..064b5ed 100644
--- a/tests/framework/ParametersLibrary.h
+++ b/tests/framework/ParametersLibrary.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/Profiler.cpp b/tests/framework/Profiler.cpp
index 7b95279..b527eb4 100644
--- a/tests/framework/Profiler.cpp
+++ b/tests/framework/Profiler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/Profiler.h b/tests/framework/Profiler.h
index 34c5224..588276f 100644
--- a/tests/framework/Profiler.h
+++ b/tests/framework/Profiler.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/Registrars.h b/tests/framework/Registrars.h
index ca23edf..c22177c 100644
--- a/tests/framework/Registrars.h
+++ b/tests/framework/Registrars.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/SConscript b/tests/framework/SConscript
index b8574bd..0dae756 100644
--- a/tests/framework/SConscript
+++ b/tests/framework/SConscript
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 ARM Limited.
+# Copyright (c) 2017 Arm Limited.
 #
 # SPDX-License-Identifier: MIT
 #
diff --git a/tests/framework/TestCase.h b/tests/framework/TestCase.h
index d7bf54d..7424fe7 100644
--- a/tests/framework/TestCase.h
+++ b/tests/framework/TestCase.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/TestCaseFactory.h b/tests/framework/TestCaseFactory.h
index afd881e..97ba230 100644
--- a/tests/framework/TestCaseFactory.h
+++ b/tests/framework/TestCaseFactory.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/TestFilter.cpp b/tests/framework/TestFilter.cpp
index 279f019..f3127f3 100644
--- a/tests/framework/TestFilter.cpp
+++ b/tests/framework/TestFilter.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -57,7 +57,7 @@
     {
         bool found = false;
 
-        for(const auto range : _id_filter)
+        for(const auto &range : _id_filter)
         {
             if(range.first <= info.id && info.id <= range.second)
             {
diff --git a/tests/framework/TestFilter.h b/tests/framework/TestFilter.h
index f64e73a..97bce7a 100644
--- a/tests/framework/TestFilter.h
+++ b/tests/framework/TestFilter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/TestResult.h b/tests/framework/TestResult.h
index cdace17..10f10c1 100644
--- a/tests/framework/TestResult.h
+++ b/tests/framework/TestResult.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/Utils.cpp b/tests/framework/Utils.cpp
index 589d274..1e25893 100644
--- a/tests/framework/Utils.cpp
+++ b/tests/framework/Utils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/Utils.h b/tests/framework/Utils.h
index 3f1d0ea..58d4e62 100644
--- a/tests/framework/Utils.h
+++ b/tests/framework/Utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/command_line/CommonOptions.cpp b/tests/framework/command_line/CommonOptions.cpp
index fee18f6..b4bf58b 100644
--- a/tests/framework/command_line/CommonOptions.cpp
+++ b/tests/framework/command_line/CommonOptions.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/command_line/CommonOptions.h b/tests/framework/command_line/CommonOptions.h
index f4646a0..e332d5f 100644
--- a/tests/framework/command_line/CommonOptions.h
+++ b/tests/framework/command_line/CommonOptions.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/datasets/CartesianProductDataset.h b/tests/framework/datasets/CartesianProductDataset.h
index b2790d7..19ac4f6 100644
--- a/tests/framework/datasets/CartesianProductDataset.h
+++ b/tests/framework/datasets/CartesianProductDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/datasets/ContainerDataset.h b/tests/framework/datasets/ContainerDataset.h
index bd63fb8..3987e8f 100644
--- a/tests/framework/datasets/ContainerDataset.h
+++ b/tests/framework/datasets/ContainerDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/datasets/Dataset.h b/tests/framework/datasets/Dataset.h
index 5fcdc49..aea3e09 100644
--- a/tests/framework/datasets/Dataset.h
+++ b/tests/framework/datasets/Dataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/datasets/Datasets.h b/tests/framework/datasets/Datasets.h
index c0e5822..d967f8a 100644
--- a/tests/framework/datasets/Datasets.h
+++ b/tests/framework/datasets/Datasets.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/datasets/InitializerListDataset.h b/tests/framework/datasets/InitializerListDataset.h
index ec1550d..87aae5f 100644
--- a/tests/framework/datasets/InitializerListDataset.h
+++ b/tests/framework/datasets/InitializerListDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/datasets/JoinDataset.h b/tests/framework/datasets/JoinDataset.h
index bf504ec..d651ac5 100644
--- a/tests/framework/datasets/JoinDataset.h
+++ b/tests/framework/datasets/JoinDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/datasets/RangeDataset.h b/tests/framework/datasets/RangeDataset.h
index 1adf183..7935b4c 100644
--- a/tests/framework/datasets/RangeDataset.h
+++ b/tests/framework/datasets/RangeDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/datasets/SingletonDataset.h b/tests/framework/datasets/SingletonDataset.h
index e0653b7..a9b7187 100644
--- a/tests/framework/datasets/SingletonDataset.h
+++ b/tests/framework/datasets/SingletonDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/datasets/ZipDataset.h b/tests/framework/datasets/ZipDataset.h
index 3d93b92..ce1bb37 100644
--- a/tests/framework/datasets/ZipDataset.h
+++ b/tests/framework/datasets/ZipDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/instruments/Instrument.h b/tests/framework/instruments/Instrument.h
index 301ca2f..4506460 100644
--- a/tests/framework/instruments/Instrument.h
+++ b/tests/framework/instruments/Instrument.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/instruments/Instruments.cpp b/tests/framework/instruments/Instruments.cpp
index 2288124..7834fea 100644
--- a/tests/framework/instruments/Instruments.cpp
+++ b/tests/framework/instruments/Instruments.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/instruments/Instruments.h b/tests/framework/instruments/Instruments.h
index 8adf501..135810c 100644
--- a/tests/framework/instruments/Instruments.h
+++ b/tests/framework/instruments/Instruments.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/instruments/InstrumentsStats.cpp b/tests/framework/instruments/InstrumentsStats.cpp
index 8f7d8a1..2bb9eed 100644
--- a/tests/framework/instruments/InstrumentsStats.cpp
+++ b/tests/framework/instruments/InstrumentsStats.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/instruments/InstrumentsStats.h b/tests/framework/instruments/InstrumentsStats.h
index f1085aa..aa2008a 100644
--- a/tests/framework/instruments/InstrumentsStats.h
+++ b/tests/framework/instruments/InstrumentsStats.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/instruments/MaliCounter.cpp b/tests/framework/instruments/MaliCounter.cpp
index 354c899..a7557fc 100644
--- a/tests/framework/instruments/MaliCounter.cpp
+++ b/tests/framework/instruments/MaliCounter.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/instruments/MaliCounter.h b/tests/framework/instruments/MaliCounter.h
index 94ef93f..c2d1849 100644
--- a/tests/framework/instruments/MaliCounter.h
+++ b/tests/framework/instruments/MaliCounter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/instruments/Measurement.h b/tests/framework/instruments/Measurement.h
index 5c62977..af272a9 100644
--- a/tests/framework/instruments/Measurement.h
+++ b/tests/framework/instruments/Measurement.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/instruments/OpenCLMemoryUsage.cpp b/tests/framework/instruments/OpenCLMemoryUsage.cpp
index 7b08e2d..5ed2013 100644
--- a/tests/framework/instruments/OpenCLMemoryUsage.cpp
+++ b/tests/framework/instruments/OpenCLMemoryUsage.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/instruments/OpenCLMemoryUsage.h b/tests/framework/instruments/OpenCLMemoryUsage.h
index 7593c01..07b6544 100644
--- a/tests/framework/instruments/OpenCLMemoryUsage.h
+++ b/tests/framework/instruments/OpenCLMemoryUsage.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/instruments/OpenCLTimer.cpp b/tests/framework/instruments/OpenCLTimer.cpp
index ca4c13c..45eb4c5 100644
--- a/tests/framework/instruments/OpenCLTimer.cpp
+++ b/tests/framework/instruments/OpenCLTimer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/instruments/OpenCLTimer.h b/tests/framework/instruments/OpenCLTimer.h
index b094ef4..9904035 100644
--- a/tests/framework/instruments/OpenCLTimer.h
+++ b/tests/framework/instruments/OpenCLTimer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/instruments/PMU.cpp b/tests/framework/instruments/PMU.cpp
index 053c70a..bb2c70e 100644
--- a/tests/framework/instruments/PMU.cpp
+++ b/tests/framework/instruments/PMU.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/instruments/PMU.h b/tests/framework/instruments/PMU.h
index ef4a9a0..c392a0a 100644
--- a/tests/framework/instruments/PMU.h
+++ b/tests/framework/instruments/PMU.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/instruments/PMUCounter.cpp b/tests/framework/instruments/PMUCounter.cpp
index df059fb..18821b5 100644
--- a/tests/framework/instruments/PMUCounter.cpp
+++ b/tests/framework/instruments/PMUCounter.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/instruments/PMUCounter.h b/tests/framework/instruments/PMUCounter.h
index 0719b10..7dddbbf 100644
--- a/tests/framework/instruments/PMUCounter.h
+++ b/tests/framework/instruments/PMUCounter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/instruments/SchedulerTimer.cpp b/tests/framework/instruments/SchedulerTimer.cpp
index 9e8bba2..aa69bc2 100644
--- a/tests/framework/instruments/SchedulerTimer.cpp
+++ b/tests/framework/instruments/SchedulerTimer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,6 +63,11 @@
         _real_scheduler.set_num_threads(num_threads);
     }
 
+    void set_num_threads_with_affinity(unsigned int num_threads, BindFunc func) override
+    {
+        _real_scheduler.set_num_threads_with_affinity(num_threads, func);
+    }
+
     unsigned int num_threads() const override
     {
         return _real_scheduler.num_threads();
@@ -86,6 +91,19 @@
         _kernels.push_back(std::move(info));
     }
 
+    void schedule_op(ICPPKernel *kernel, const Hints &hints, ITensorPack &tensors) override
+    {
+        _timer.start();
+        _real_scheduler.schedule_op(kernel, hints, tensors);
+        _timer.stop();
+
+        typename SchedulerClock<output_timestamps>::kernel_info info;
+        info.name         = kernel->name();
+        info.prefix       = _prefix;
+        info.measurements = _timer.measurements();
+        _kernels.push_back(std::move(info));
+    }
+
     void run_tagged_workloads(std::vector<Workload> &workloads, const char *tag) override
     {
         _timer.start();
diff --git a/tests/framework/instruments/SchedulerTimer.h b/tests/framework/instruments/SchedulerTimer.h
index ea64b22..aa948d3 100644
--- a/tests/framework/instruments/SchedulerTimer.h
+++ b/tests/framework/instruments/SchedulerTimer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/instruments/WallClockTimer.cpp b/tests/framework/instruments/WallClockTimer.cpp
index 0e21ac7..9063da0 100644
--- a/tests/framework/instruments/WallClockTimer.cpp
+++ b/tests/framework/instruments/WallClockTimer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/instruments/WallClockTimer.h b/tests/framework/instruments/WallClockTimer.h
index fb047aa..1ca4fe2 100644
--- a/tests/framework/instruments/WallClockTimer.h
+++ b/tests/framework/instruments/WallClockTimer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/instruments/hwc.hpp b/tests/framework/instruments/hwc.hpp
index 8c48e0c..5d12a40 100644
--- a/tests/framework/instruments/hwc.hpp
+++ b/tests/framework/instruments/hwc.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/instruments/hwc_names.hpp b/tests/framework/instruments/hwc_names.hpp
index cbcb0e7..e68bcbe 100644
--- a/tests/framework/instruments/hwc_names.hpp
+++ b/tests/framework/instruments/hwc_names.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/printers/JSONPrinter.cpp b/tests/framework/printers/JSONPrinter.cpp
index 179cdf9..0995ff3 100644
--- a/tests/framework/printers/JSONPrinter.cpp
+++ b/tests/framework/printers/JSONPrinter.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/printers/JSONPrinter.h b/tests/framework/printers/JSONPrinter.h
index 3b783ac..ce587ad 100644
--- a/tests/framework/printers/JSONPrinter.h
+++ b/tests/framework/printers/JSONPrinter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/printers/PrettyPrinter.cpp b/tests/framework/printers/PrettyPrinter.cpp
index fe0540a..aa06eb9 100644
--- a/tests/framework/printers/PrettyPrinter.cpp
+++ b/tests/framework/printers/PrettyPrinter.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/printers/PrettyPrinter.h b/tests/framework/printers/PrettyPrinter.h
index 95487f9..ded0da0 100644
--- a/tests/framework/printers/PrettyPrinter.h
+++ b/tests/framework/printers/PrettyPrinter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/printers/Printer.cpp b/tests/framework/printers/Printer.cpp
index e034c2e..12e2460 100644
--- a/tests/framework/printers/Printer.cpp
+++ b/tests/framework/printers/Printer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/printers/Printer.h b/tests/framework/printers/Printer.h
index cbe22fb..669b7f6 100644
--- a/tests/framework/printers/Printer.h
+++ b/tests/framework/printers/Printer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/printers/Printers.cpp b/tests/framework/printers/Printers.cpp
index 6e11b63..9ba3098 100644
--- a/tests/framework/printers/Printers.cpp
+++ b/tests/framework/printers/Printers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/framework/printers/Printers.h b/tests/framework/printers/Printers.h
index 53867e2..acbddf0 100644
--- a/tests/framework/printers/Printers.h
+++ b/tests/framework/printers/Printers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/instruments/Helpers.h b/tests/instruments/Helpers.h
index bc0ff7d..be463b2 100644
--- a/tests/instruments/Helpers.h
+++ b/tests/instruments/Helpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/main.cpp b/tests/main.cpp
index 17342fc..f0d5df7 100644
--- a/tests/main.cpp
+++ b/tests/main.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validate_examples/RunExample.cpp b/tests/validate_examples/RunExample.cpp
index 998d501..b6ff883 100644
--- a/tests/validate_examples/RunExample.cpp
+++ b/tests/validate_examples/RunExample.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validate_examples/ValidateExample.h b/tests/validate_examples/ValidateExample.h
index ce67d7c..4e8dade 100644
--- a/tests/validate_examples/ValidateExample.h
+++ b/tests/validate_examples/ValidateExample.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validate_examples/cl_gemm.cpp b/tests/validate_examples/cl_gemm.cpp
index cdf60cd..3489584 100644
--- a/tests/validate_examples/cl_gemm.cpp
+++ b/tests/validate_examples/cl_gemm.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validate_examples/graph_convolution.cpp b/tests/validate_examples/graph_convolution.cpp
index 1148b8a..257bc5d 100644
--- a/tests/validate_examples/graph_convolution.cpp
+++ b/tests/validate_examples/graph_convolution.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validate_examples/graph_depthwiseconvolution.cpp b/tests/validate_examples/graph_depthwiseconvolution.cpp
index 7660e47..b2a77ba 100644
--- a/tests/validate_examples/graph_depthwiseconvolution.cpp
+++ b/tests/validate_examples/graph_depthwiseconvolution.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validate_examples/graph_fully_connected.cpp b/tests/validate_examples/graph_fully_connected.cpp
index fb85b65..4c1b593 100644
--- a/tests/validate_examples/graph_fully_connected.cpp
+++ b/tests/validate_examples/graph_fully_connected.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validate_examples/graph_validate_utils.h b/tests/validate_examples/graph_validate_utils.h
index edc3a65..36134a4 100644
--- a/tests/validate_examples/graph_validate_utils.h
+++ b/tests/validate_examples/graph_validate_utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/AbsLayer.cpp b/tests/validation/CL/AbsLayer.cpp
index 43fefad..e6ba14b 100644
--- a/tests/validation/CL/AbsLayer.cpp
+++ b/tests/validation/CL/AbsLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/AbsoluteDifference.cpp b/tests/validation/CL/AbsoluteDifference.cpp
index 2f739d0..b2f0280 100644
--- a/tests/validation/CL/AbsoluteDifference.cpp
+++ b/tests/validation/CL/AbsoluteDifference.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Accumulate.cpp b/tests/validation/CL/Accumulate.cpp
index 0ec89b8..ee2d252 100644
--- a/tests/validation/CL/Accumulate.cpp
+++ b/tests/validation/CL/Accumulate.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/ActivationLayer.cpp b/tests/validation/CL/ActivationLayer.cpp
index b32e379..1fef384 100644
--- a/tests/validation/CL/ActivationLayer.cpp
+++ b/tests/validation/CL/ActivationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/ArgMinMax.cpp b/tests/validation/CL/ArgMinMax.cpp
index e5decb8..7dcd22e 100644
--- a/tests/validation/CL/ArgMinMax.cpp
+++ b/tests/validation/CL/ArgMinMax.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -182,6 +182,31 @@
     validate(CLAccessor(_target), _reference);
 }
 TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       CLArgMinMaxQuantizedValidationFixture<int8_t>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(ArgMinMaxSmallDataset, framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
+                                       framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })),
+                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge,
+                       CLArgMinMaxQuantizedValidationFixture<int8_t>,
+                       framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(ArgMinMaxLargeDataset, framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
+                                       framework::dataset::make("Operation", { ReductionOperation::ARG_IDX_MIN, ReductionOperation::ARG_IDX_MAX })),
+                               framework::dataset::make("QuantizationInfo", { QuantizationInfo(5.f / 255.f, 20) })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+
 TEST_SUITE_END() // Quantized
 TEST_SUITE_END() // ArgMinMax
 TEST_SUITE_END() // CL
diff --git a/tests/validation/CL/ArithmeticAddition.cpp b/tests/validation/CL/ArithmeticAddition.cpp
index 41415ee..93faa7e 100644
--- a/tests/validation/CL/ArithmeticAddition.cpp
+++ b/tests/validation/CL/ArithmeticAddition.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,7 +43,6 @@
 {
 namespace
 {
-constexpr unsigned int num_elems_processed_per_iteration = 16;
 /** Input data sets **/
 const auto ArithmeticAdditionU8Dataset = combine(combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U8)), framework::dataset::make("DataType",
                                                  DataType::U8));
@@ -103,34 +102,36 @@
 // clang-format on
 // *INDENT-ON*
 
+/** Validate fused activation expecting the following behaviours:
+ *
+ * - Fused activation with float data type should succeed
+ * - Fused activation with quantized data type should fail
+ *
+ */
+TEST_CASE(FusedActivation, framework::DatasetMode::ALL)
+{
+    auto   input  = TensorInfo{ TensorShape(2U, 2U), 1, DataType::F32 };
+    auto   output = TensorInfo{ TensorShape(2U, 2U), 1, DataType::F32 };
+    Status result{};
+
+    const auto act_info = ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU);
+
+    // Fused-activation float type
+    result = CLArithmeticAddition::validate(&input, &input, &output, ConvertPolicy::WRAP, act_info);
+    ARM_COMPUTE_EXPECT(bool(result) == true, framework::LogLevel::ERRORS);
+
+    // Fused-activation quantized type
+    input.set_data_type(DataType::QASYMM8);
+    output.set_data_type(DataType::QASYMM8);
+    result = CLArithmeticAddition::validate(&input, &input, &output, ConvertPolicy::WRAP, act_info);
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
+}
+
 template <typename T>
 using CLArithmeticAdditionFixture = ArithmeticAdditionValidationFixture<CLTensor, CLAccessor, CLArithmeticAddition, T>;
 
 TEST_SUITE(Integer)
 TEST_SUITE(U8)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-               shape, policy)
-{
-    // Create tensors
-    CLTensor ref_src1 = create_tensor<CLTensor>(shape, DataType::U8);
-    CLTensor ref_src2 = create_tensor<CLTensor>(shape, DataType::U8);
-    CLTensor dst      = create_tensor<CLTensor>(shape, DataType::U8);
-
-    // Create and Configure function
-    CLArithmeticAddition add;
-    add.configure(&ref_src1, &ref_src2, &dst, policy);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticAdditionFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ArithmeticAdditionU8Dataset),
                                                                                                                   framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
 {
@@ -140,30 +141,6 @@
 TEST_SUITE_END() // U8
 
 TEST_SUITE(S16)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", { DataType::U8, DataType::S16 })),
-                                                                   framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-               shape, data_type, policy)
-{
-    // Create tensors
-    CLTensor ref_src1 = create_tensor<CLTensor>(shape, data_type);
-    CLTensor ref_src2 = create_tensor<CLTensor>(shape, DataType::S16);
-    CLTensor dst      = create_tensor<CLTensor>(shape, DataType::S16);
-
-    // Create and Configure function
-    CLArithmeticAddition add;
-    add.configure(&ref_src1, &ref_src2, &dst, policy);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticAdditionFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ArithmeticAdditionS16Dataset),
                                                                                                                   framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
 {
@@ -185,29 +162,6 @@
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
-               shape, policy)
-{
-    // Create tensors
-    CLTensor ref_src1 = create_tensor<CLTensor>(shape, DataType::QASYMM8);
-    CLTensor ref_src2 = create_tensor<CLTensor>(shape, DataType::QASYMM8);
-    CLTensor dst      = create_tensor<CLTensor>(shape, DataType::QASYMM8);
-
-    // Create and Configure function
-    CLArithmeticAddition add;
-    add.configure(&ref_src1, &ref_src2, &dst, policy);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticAdditionQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
                        ArithmeticAdditionQASYMM8Dataset),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
@@ -220,29 +174,6 @@
 }
 TEST_SUITE_END() // QASYMM8
 TEST_SUITE(QASYMM8_SIGNED)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
-               shape, policy)
-{
-    // Create tensors
-    CLTensor ref_src1 = create_tensor<CLTensor>(shape, DataType::QASYMM8_SIGNED);
-    CLTensor ref_src2 = create_tensor<CLTensor>(shape, DataType::QASYMM8_SIGNED);
-    CLTensor dst      = create_tensor<CLTensor>(shape, DataType::QASYMM8_SIGNED);
-
-    // Create and Configure function
-    CLArithmeticAddition add;
-    add.configure(&ref_src1, &ref_src2, &dst, policy);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticAdditionQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
                        ArithmeticAdditionQASYMM8SignedDataset),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
@@ -255,29 +186,6 @@
 }
 TEST_SUITE_END() // QASYMM8_SIGNED
 TEST_SUITE(QSYMM16)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
-               shape, policy)
-{
-    // Create tensors
-    CLTensor ref_src1 = create_tensor<CLTensor>(shape, DataType::QSYMM16);
-    CLTensor ref_src2 = create_tensor<CLTensor>(shape, DataType::QSYMM16);
-    CLTensor dst      = create_tensor<CLTensor>(shape, DataType::QSYMM16);
-
-    // Create and Configure function
-    CLArithmeticAddition add;
-    add.configure(&ref_src1, &ref_src2, &dst, policy);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticAdditionQuantizedFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
                        ArithmeticAdditionQSYMM16Dataset),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
@@ -313,29 +221,6 @@
 TEST_SUITE_END() // FP16
 
 TEST_SUITE(FP32)
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-               shape, policy)
-{
-    // Create tensors
-    CLTensor ref_src1 = create_tensor<CLTensor>(shape, DataType::F32);
-    CLTensor ref_src2 = create_tensor<CLTensor>(shape, DataType::F32);
-    CLTensor dst      = create_tensor<CLTensor>(shape, DataType::F32);
-
-    // Create and Configure function
-    CLArithmeticAddition add;
-    add.configure(&ref_src1, &ref_src2, &dst, policy);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(shape);
-    validate(dst.info()->valid_region(), valid_region);
-
-    // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), num_elems_processed_per_iteration).required_padding();
-    validate(ref_src1.info()->padding(), padding);
-    validate(ref_src2.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
-}
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticAdditionFloatFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), ArithmeticAdditionFP32Dataset),
                                                                                                                      framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
                                                                                                                      EmptyActivationFunctionsDataset))
diff --git a/tests/validation/CL/ArithmeticDivision.cpp b/tests/validation/CL/ArithmeticDivision.cpp
index d970c31..82a0ec5 100644
--- a/tests/validation/CL/ArithmeticDivision.cpp
+++ b/tests/validation/CL/ArithmeticDivision.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/ArithmeticSubtraction.cpp b/tests/validation/CL/ArithmeticSubtraction.cpp
index 897ae1a..52d787f 100644
--- a/tests/validation/CL/ArithmeticSubtraction.cpp
+++ b/tests/validation/CL/ArithmeticSubtraction.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -70,6 +70,8 @@
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.75f, 0.25f),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 0.75f, 0.25f)
 });
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true });
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false });
 } // namespace
 
 TEST_SUITE(CL)
@@ -104,6 +106,58 @@
 // clang-format on
 // *INDENT-ON*
 
+TEST_SUITE(InPlaceValidate)
+TEST_CASE(SingleTensor, framework::DatasetMode::ALL)
+{
+    const auto random_shape       = TensorShape{ 9, 9 };
+    const auto single_tensor_info = TensorInfo{ random_shape, 1, DataType::F32 };
+
+    Status result = CLArithmeticSubtraction::validate(&single_tensor_info, &single_tensor_info, &single_tensor_info, ConvertPolicy::WRAP);
+    ARM_COMPUTE_EXPECT(bool(result) == true, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(ValidBroadCast, framework::DatasetMode::ALL)
+{
+    const auto larger_shape  = TensorShape{ 27U, 13U, 2U };
+    const auto smaller_shape = TensorShape{ 1U, 13U, 2U };
+
+    const auto larger_tensor_info  = TensorInfo{ larger_shape, 1, DataType::F32 };
+    const auto smaller_tensor_info = TensorInfo{ smaller_shape, 1, DataType::F32 };
+
+    Status result = CLArithmeticSubtraction::validate(&larger_tensor_info, &smaller_tensor_info, &larger_tensor_info, ConvertPolicy::WRAP);
+    ARM_COMPUTE_EXPECT(bool(result) == true, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(InvalidBroadcastOutput, framework::DatasetMode::ALL)
+{
+    const auto larger_shape  = TensorShape{ 27U, 13U, 2U };
+    const auto smaller_shape = TensorShape{ 1U, 13U, 2U };
+
+    const auto larger_tensor_info  = TensorInfo{ larger_shape, 1, DataType::F32 };
+    const auto smaller_tensor_info = TensorInfo{ smaller_shape, 1, DataType::F32 };
+
+    Status result = CLArithmeticSubtraction::validate(&larger_tensor_info, &smaller_tensor_info, &smaller_tensor_info, ConvertPolicy::WRAP);
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(InvalidBroadcastBoth, framework::DatasetMode::ALL)
+{
+    const auto shape0 = TensorShape{ 9U, 9U };
+    const auto shape1 = TensorShape{ 9U, 1U, 2U };
+
+    const auto info0 = TensorInfo{ shape0, 1, DataType::F32 };
+    const auto info1 = TensorInfo{ shape1, 1, DataType::F32 };
+
+    Status result{};
+
+    result = CLArithmeticSubtraction::validate(&info0, &info1, &info0, ConvertPolicy::WRAP);
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
+
+    result = CLArithmeticSubtraction::validate(&info0, &info1, &info1, ConvertPolicy::WRAP);
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
+}
+TEST_SUITE_END() // InPlaceValidate
+
 template <typename T>
 using CLArithmeticSubtractionFixture = ArithmeticSubtractionValidationFixture<CLTensor, CLAccessor, CLArithmeticSubtraction, T>;
 
@@ -132,8 +186,9 @@
     validate(dst.info()->padding(), padding);
 }
 
-FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ArithmeticSubtractionU8Dataset),
-                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionU8Dataset),
+                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                     OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -165,15 +220,17 @@
     validate(dst.info()->padding(), padding);
 }
 
-FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ArithmeticSubtractionS16Dataset),
-                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionS16Dataset),
+                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                     OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, CLArithmeticSubtractionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), ArithmeticSubtractionS16Dataset),
-                                                                                                                   framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLArithmeticSubtractionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), ArithmeticSubtractionS16Dataset),
+                                                                                                                   framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                   OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -209,12 +266,13 @@
     validate(dst.info()->padding(), padding);
 }
 
-FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
                        ArithmeticSubtractionQASYMM8Dataset),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
                        framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 20) })),
                        framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                       framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+                       framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                       InPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -244,12 +302,13 @@
     validate(dst.info()->padding(), padding);
 }
 
-FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
                        ArithmeticSubtractionQASYMM8SignedDataset),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
                        framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 10) })),
                        framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                       framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+                       framework::dataset::make("OutQInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                       InPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -279,12 +338,13 @@
     validate(dst.info()->padding(), padding);
 }
 
-FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionQuantizedFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallShapes(),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionQuantizedFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
                        ArithmeticSubtractionQSYMM16Dataset),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
                        framework::dataset::make("Src0QInfo", { QuantizationInfo(1.f / 32768.f, 0), QuantizationInfo(5.f / 32768.f, 0) })),
                        framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 32768.f, 0), QuantizationInfo(5.f / 32768.f, 0) })),
-                       framework::dataset::make("OutQInfo", { QuantizationInfo(5.f / 32768.f, 0) })))
+                       framework::dataset::make("OutQInfo", { QuantizationInfo(5.f / 32768.f, 0) })),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -297,16 +357,19 @@
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionFP16Dataset),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionFP16Dataset),
                                                                                                                  framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                 EmptyActivationFunctionsDataset))
+                                                                                                                 EmptyActivationFunctionsDataset),
+                                                                                                                 OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
-FIXTURE_DATA_TEST_CASE(RunWithActivation, CLArithmeticSubtractionFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::TinyShapes(), ArithmeticSubtractionFP16Dataset),
+FIXTURE_DATA_TEST_CASE(RunWithActivation, CLArithmeticSubtractionFloatFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::TinyShapes(),
+                       ArithmeticSubtractionFP16Dataset),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                       ActivationFunctionsDataset))
+                       ActivationFunctionsDataset),
+                       InPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -337,24 +400,30 @@
     validate(dst.info()->padding(), padding);
 }
 
-FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionFloatFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionFP32Dataset),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLArithmeticSubtractionFloatFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                                        ArithmeticSubtractionFP32Dataset),
                                                                                                                         framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                        EmptyActivationFunctionsDataset))
+                                                                                                                        EmptyActivationFunctionsDataset),
+                                                                                                                        OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
-FIXTURE_DATA_TEST_CASE(RunWithActivation, CLArithmeticSubtractionFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::TinyShapes(), ArithmeticSubtractionFP32Dataset),
+FIXTURE_DATA_TEST_CASE(RunWithActivation, CLArithmeticSubtractionFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::TinyShapes(),
+                       ArithmeticSubtractionFP32Dataset),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                       ActivationFunctionsDataset))
+                       ActivationFunctionsDataset),
+                       InPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, CLArithmeticSubtractionFloatFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), ArithmeticSubtractionFP32Dataset),
+FIXTURE_DATA_TEST_CASE(RunLarge, CLArithmeticSubtractionFloatFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeShapes(),
+                                                                                                                      ArithmeticSubtractionFP32Dataset),
                                                                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                                                                                                                      EmptyActivationFunctionsDataset))
+                                                                                                                      EmptyActivationFunctionsDataset),
+                                                                                                                      OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -363,27 +432,30 @@
 template <typename T>
 using CLArithmeticSubtractionBroadcastFloatFixture = ArithmeticSubtractionBroadcastValidationFloatFixture<CLTensor, CLAccessor, CLArithmeticSubtraction, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, CLArithmeticSubtractionBroadcastFloatFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapesBroadcast(),
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, CLArithmeticSubtractionBroadcastFloatFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallShapesBroadcast(),
                        ArithmeticSubtractionFP32Dataset),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                       EmptyActivationFunctionsDataset))
+                       EmptyActivationFunctionsDataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
-FIXTURE_DATA_TEST_CASE(RunWithActivationBroadcast, CLArithmeticSubtractionBroadcastFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::TinyShapesBroadcast(),
+FIXTURE_DATA_TEST_CASE(RunWithActivationBroadcast, CLArithmeticSubtractionBroadcastFloatFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::TinyShapesBroadcast(),
                        ArithmeticSubtractionFP32Dataset),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                       ActivationFunctionsDataset))
+                       ActivationFunctionsDataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, CLArithmeticSubtractionBroadcastFloatFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapesBroadcast(),
+FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, CLArithmeticSubtractionBroadcastFloatFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeShapesBroadcast(),
                        ArithmeticSubtractionFP32Dataset),
                        framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
-                       EmptyActivationFunctionsDataset))
+                       EmptyActivationFunctionsDataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
diff --git a/tests/validation/CL/BatchConcatenateLayer.cpp b/tests/validation/CL/BatchConcatenateLayer.cpp
index 6c4ffee..6fd189b 100644
--- a/tests/validation/CL/BatchConcatenateLayer.cpp
+++ b/tests/validation/CL/BatchConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -70,7 +70,7 @@
     inputs_vector_info.emplace_back(std::move(input_info1));
     inputs_vector_info.emplace_back(std::move(input_info2));
 
-    std::vector<ITensorInfo *> inputs_vector_info_raw;
+    std::vector<const ITensorInfo *> inputs_vector_info_raw;
     inputs_vector_info_raw.reserve(inputs_vector_info.size());
     for(auto &input : inputs_vector_info)
     {
@@ -97,8 +97,8 @@
     ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
 
     // Create and configure function
-    CLConcatenateLayer       concat_layer;
-    std::vector<ICLTensor *> inputs;
+    CLConcatenateLayer             concat_layer;
+    std::vector<const ICLTensor *> inputs;
     inputs.emplace_back(&src1);
     inputs.emplace_back(&src2);
     inputs.emplace_back(&src3);
diff --git a/tests/validation/CL/BatchNormalizationLayer.cpp b/tests/validation/CL/BatchNormalizationLayer.cpp
index dee703e..cb17204 100644
--- a/tests/validation/CL/BatchNormalizationLayer.cpp
+++ b/tests/validation/CL/BatchNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/BatchToSpaceLayer.cpp b/tests/validation/CL/BatchToSpaceLayer.cpp
index 30b7817..f553787 100644
--- a/tests/validation/CL/BatchToSpaceLayer.cpp
+++ b/tests/validation/CL/BatchToSpaceLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/BitwiseAnd.cpp b/tests/validation/CL/BitwiseAnd.cpp
index 2aa15f6..76db5bb 100644
--- a/tests/validation/CL/BitwiseAnd.cpp
+++ b/tests/validation/CL/BitwiseAnd.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/BitwiseNot.cpp b/tests/validation/CL/BitwiseNot.cpp
index 35e0d3d..d3b1c5d 100644
--- a/tests/validation/CL/BitwiseNot.cpp
+++ b/tests/validation/CL/BitwiseNot.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/BitwiseOr.cpp b/tests/validation/CL/BitwiseOr.cpp
index 5e24b5e..585170f 100644
--- a/tests/validation/CL/BitwiseOr.cpp
+++ b/tests/validation/CL/BitwiseOr.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/BitwiseXor.cpp b/tests/validation/CL/BitwiseXor.cpp
index 35b2b0d..bd7ff5b 100644
--- a/tests/validation/CL/BitwiseXor.cpp
+++ b/tests/validation/CL/BitwiseXor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/BoundingBoxTransform.cpp b/tests/validation/CL/BoundingBoxTransform.cpp
index 2491e18..82dfa31 100644
--- a/tests/validation/CL/BoundingBoxTransform.cpp
+++ b/tests/validation/CL/BoundingBoxTransform.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Box3x3.cpp b/tests/validation/CL/Box3x3.cpp
index 28ec6cd..8d79189 100644
--- a/tests/validation/CL/Box3x3.cpp
+++ b/tests/validation/CL/Box3x3.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/CannyEdge.cpp b/tests/validation/CL/CannyEdge.cpp
index f2944f5..f8cf9f0 100644
--- a/tests/validation/CL/CannyEdge.cpp
+++ b/tests/validation/CL/CannyEdge.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Cast.cpp b/tests/validation/CL/Cast.cpp
index 854290d..a283aec 100644
--- a/tests/validation/CL/Cast.cpp
+++ b/tests/validation/CL/Cast.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/ChannelCombine.cpp b/tests/validation/CL/ChannelCombine.cpp
index dca8817..6187e72 100644
--- a/tests/validation/CL/ChannelCombine.cpp
+++ b/tests/validation/CL/ChannelCombine.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/ChannelExtract.cpp b/tests/validation/CL/ChannelExtract.cpp
index 45c0fa7..7657d5a 100644
--- a/tests/validation/CL/ChannelExtract.cpp
+++ b/tests/validation/CL/ChannelExtract.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/ChannelShuffle.cpp b/tests/validation/CL/ChannelShuffle.cpp
index 8b5a4f5..8c06e6b 100644
--- a/tests/validation/CL/ChannelShuffle.cpp
+++ b/tests/validation/CL/ChannelShuffle.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Col2Im.cpp b/tests/validation/CL/Col2Im.cpp
index 88aa787..d6ef010 100644
--- a/tests/validation/CL/Col2Im.cpp
+++ b/tests/validation/CL/Col2Im.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/ColorConvert.cpp b/tests/validation/CL/ColorConvert.cpp
index d4322fc..37957cd 100644
--- a/tests/validation/CL/ColorConvert.cpp
+++ b/tests/validation/CL/ColorConvert.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Comparisons.cpp b/tests/validation/CL/Comparisons.cpp
index e5b07fd..fb8935b 100644
--- a/tests/validation/CL/Comparisons.cpp
+++ b/tests/validation/CL/Comparisons.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/ConvertFullyConnectedWeights.cpp b/tests/validation/CL/ConvertFullyConnectedWeights.cpp
index ce51e41..a5065fb 100644
--- a/tests/validation/CL/ConvertFullyConnectedWeights.cpp
+++ b/tests/validation/CL/ConvertFullyConnectedWeights.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Convolution.cpp b/tests/validation/CL/Convolution.cpp
index 9c33d45..8804d34 100644
--- a/tests/validation/CL/Convolution.cpp
+++ b/tests/validation/CL/Convolution.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/ConvolutionLayer.cpp b/tests/validation/CL/ConvolutionLayer.cpp
index 20ebc4d..8c40b7e 100644
--- a/tests/validation/CL/ConvolutionLayer.cpp
+++ b/tests/validation/CL/ConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -262,6 +262,19 @@
 TEST_SUITE_END() // QASYMM8_SIGNED
 TEST_SUITE(QSYMM8_PER_CHANNEL)
 
+FIXTURE_DATA_TEST_CASE(RunSmallSigned, CLGEMMConvolutionLayerQuantizedPerChannelFixture<int8_t>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                                                       framework::dataset::make("ReshapeWeights", { true })),
+                                                               framework::dataset::make("DataType", { DataType::QASYMM8_SIGNED })),
+                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                               QuantizationData),
+                                       QuantizedActivationFunctionsSmallDataset),
+                               framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+
 FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerQuantizedPerChannelFixture<uint8_t>, framework::DatasetMode::ALL,
                        combine(combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
                                                                        framework::dataset::make("ReshapeWeights", { true })),
diff --git a/tests/validation/CL/Copy.cpp b/tests/validation/CL/Copy.cpp
index 34ba3ab..07af243 100644
--- a/tests/validation/CL/Copy.cpp
+++ b/tests/validation/CL/Copy.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/CropResize.cpp b/tests/validation/CL/CropResize.cpp
index cacf405..636db17 100644
--- a/tests/validation/CL/CropResize.cpp
+++ b/tests/validation/CL/CropResize.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -52,7 +52,6 @@
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
                framework::dataset::make("InputInfo", { TensorInfo(TensorShape(15U, 30U, 40U, 10U), 1, DataType::S32),
-                                                       TensorInfo(TensorShape(15U, 30U, 40U, 10U), 1, DataType::U8),  // Invalid input data type.
                                                        TensorInfo(TensorShape(15U, 30U, 40U, 10U), 1, DataType::S32), // Invalid box_ind shape.
                                                        TensorInfo(TensorShape(15U, 30U, 40U, 10U), 1, DataType::S32), // Invalid output shape.
                                                        TensorInfo(TensorShape(15U, 30U, 40U, 10U), 1, DataType::S32), // Invalid output data type.
@@ -64,11 +63,9 @@
                                                        TensorInfo(TensorShape(4, 20), 1, DataType::F32),
                                                        TensorInfo(TensorShape(4, 20), 1, DataType::F32),
                                                        TensorInfo(TensorShape(4, 20), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(4, 20), 1, DataType::F32),
                                                        TensorInfo(TensorShape(3, 20), 1, DataType::F32),
                                                      })),
                framework::dataset::make("BoxIndInfo",{ TensorInfo(TensorShape(20), 1, DataType::S32),
-                                                       TensorInfo(TensorShape(20), 1, DataType::S32),
                                                        TensorInfo(TensorShape(10), 1, DataType::S32),
                                                        TensorInfo(TensorShape(20), 1, DataType::S32),
                                                        TensorInfo(TensorShape(20), 1, DataType::S32),
@@ -77,13 +74,12 @@
                                                      })),
                framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(15U, 5, 5, 20U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(15U, 5, 5, 20U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(15U, 5, 5, 20U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(15U, 5, 5, 10U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(15U, 5, 5, 20U), 1, DataType::S32),
                                                        TensorInfo(TensorShape(5U, 5, 5, 20U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(15U, 5, 5, 20U), 1, DataType::F32),
                                                      })),
-               framework::dataset::make("Expected", { true, false, false, false, false, false, false})),
+               framework::dataset::make("Expected", { true, false, false, false, false, false})),
                input, boxes, box_ind, output, expected)
 {
     ARM_COMPUTE_EXPECT(bool(CLCropResize::validate(&input.clone()->set_data_layout(DataLayout::NHWC).set_is_resizable(false),
@@ -125,6 +121,19 @@
 TEST_SUITE_END() // F32
 TEST_SUITE_END() // Float
 
+TEST_SUITE(U8)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       CLCropResizeFixture<uint8_t>,
+                       framework::DatasetMode::PRECOMMIT,
+                       combine(datasets::SmallCropResizeDataset(),
+                               combine(framework::dataset::make("IsOutOfBounds", { true, false }),
+                                       framework::dataset::make("DataType", DataType::U8))))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_fp32, 0.01);
+}
+TEST_SUITE_END() // U8
+
 TEST_SUITE(U16)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        CLCropResizeFixture<uint16_t>,
diff --git a/tests/validation/CL/DeconvolutionLayer.cpp b/tests/validation/CL/DeconvolutionLayer.cpp
index dd92887..c677f5a 100644
--- a/tests/validation/CL/DeconvolutionLayer.cpp
+++ b/tests/validation/CL/DeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/DepthConcatenateLayer.cpp b/tests/validation/CL/DepthConcatenateLayer.cpp
index c67ed05..4f5bd11 100644
--- a/tests/validation/CL/DepthConcatenateLayer.cpp
+++ b/tests/validation/CL/DepthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -67,7 +67,7 @@
     inputs_vector_info.emplace_back(std::move(input_info1));
     inputs_vector_info.emplace_back(std::move(input_info2));
 
-    std::vector<ITensorInfo *> inputs_vector_info_raw;
+    std::vector<const ITensorInfo *> inputs_vector_info_raw;
     inputs_vector_info_raw.reserve(inputs_vector_info.size());
     for(auto &input : inputs_vector_info)
     {
@@ -94,8 +94,8 @@
     ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
 
     // Create and configure function
-    CLConcatenateLayer       concat_layer;
-    std::vector<ICLTensor *> inputs;
+    CLConcatenateLayer             concat_layer;
+    std::vector<const ICLTensor *> inputs;
     inputs.emplace_back(&src1);
     inputs.emplace_back(&src2);
     inputs.emplace_back(&src3);
diff --git a/tests/validation/CL/DepthConvertLayer.cpp b/tests/validation/CL/DepthConvertLayer.cpp
index 24b2297..c6595e4 100644
--- a/tests/validation/CL/DepthConvertLayer.cpp
+++ b/tests/validation/CL/DepthConvertLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/DepthToSpaceLayer.cpp b/tests/validation/CL/DepthToSpaceLayer.cpp
index 8a8ed62..fd570ad 100644
--- a/tests/validation/CL/DepthToSpaceLayer.cpp
+++ b/tests/validation/CL/DepthToSpaceLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/DepthwiseConvolutionLayer.cpp b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
index 07b1235..c779092 100644
--- a/tests/validation/CL/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/DepthwiseConvolutionLayerNative.cpp b/tests/validation/CL/DepthwiseConvolutionLayerNative.cpp
index a3a5df1..058d9b3 100644
--- a/tests/validation/CL/DepthwiseConvolutionLayerNative.cpp
+++ b/tests/validation/CL/DepthwiseConvolutionLayerNative.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/DequantizationLayer.cpp b/tests/validation/CL/DequantizationLayer.cpp
index acc0022..fa283c9 100644
--- a/tests/validation/CL/DequantizationLayer.cpp
+++ b/tests/validation/CL/DequantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Derivative.cpp b/tests/validation/CL/Derivative.cpp
index 964a0b9..284da20 100644
--- a/tests/validation/CL/Derivative.cpp
+++ b/tests/validation/CL/Derivative.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Dilate.cpp b/tests/validation/CL/Dilate.cpp
index 6b1904e..e6605bb 100644
--- a/tests/validation/CL/Dilate.cpp
+++ b/tests/validation/CL/Dilate.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/DilatedConvolutionLayer.cpp b/tests/validation/CL/DilatedConvolutionLayer.cpp
index 9d35be4..20ba113 100644
--- a/tests/validation/CL/DilatedConvolutionLayer.cpp
+++ b/tests/validation/CL/DilatedConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/DirectConvolutionLayer.cpp b/tests/validation/CL/DirectConvolutionLayer.cpp
index 3c39151..090bd22 100644
--- a/tests/validation/CL/DirectConvolutionLayer.cpp
+++ b/tests/validation/CL/DirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,11 +43,11 @@
 {
 namespace
 {
-// COMPMID-517 Investigate the mismatch to see whether it is a real bug
-RelativeTolerance<half>              tolerance_fp16(half(0.2)); /**< Tolerance for floating point tests */
-RelativeTolerance<float>             tolerance_fp32(0.02f);     /**< Tolerance for floating point tests */
-constexpr float                      tolerance_num = 0.07f;     /**< Tolerance number */
-constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);      /**< Tolerance for quantized tests */
+RelativeTolerance<half>              tolerance_fp16(half(0.2));   /**< Tolerance for floating point tests */
+RelativeTolerance<float>             tolerance_fp32(0.05f);       /**< Tolerance for floating point tests */
+AbsoluteTolerance<float>             tolerance_fp32_abs(0.0003f); /**< Absolute Tolerance for floating point tests */
+constexpr float                      tolerance_num = 0.07f;       /**< Tolerance number */
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);        /**< Tolerance for quantized tests */
 
 const auto data_strides          = combine(framework::dataset::make("StrideX", 1, 3), framework::dataset::make("StrideY", 1, 3));
 const auto data_strides_small    = combine(framework::dataset::make("StrideX", 1), framework::dataset::make("StrideY", 1));
@@ -66,8 +66,16 @@
 const auto data_small9x9 = combine(datasets::SmallDirectConvolutionShapes(), combine(data_strides_small, data_ksize_nine_small));
 
 /** Direct convolution nightly data set. */
-const auto data_nightly     = combine(data, framework::dataset::make("NumKernels", { 1, 4 }));
-const auto data_nightly_9x9 = combine(data9x9, framework::dataset::make("NumKernels", { 1, 4 }));
+const auto data_nightly         = combine(data, framework::dataset::make("NumKernels", { 1, 4 }));
+const auto data_nightly_9x9     = combine(data9x9, framework::dataset::make("NumKernels", { 1, 4 }));
+const auto data_nightly_usecase = combine(framework::dataset::make("InputShape", { TensorShape{ 3U, 800U, 800U } }),
+                                          combine(framework::dataset::make("StrideX", { 1 }),
+                                                  combine(framework::dataset::make("StrideY", { 1 }),
+                                                          combine(framework::dataset::make("PadX", { 4 }),
+                                                                  combine(framework::dataset::make("PadY", { 4 }),
+                                                                          combine(framework::dataset::make("KernelSize", 9),
+                                                                                  framework::dataset::make("NumKernels", { 16 })))))));
+
 /** Direct convolution precommit data set. */
 const auto data_precommit     = combine(data_small, framework::dataset::make("NumKernels", { 1 }));
 const auto data_precommit_9x9 = combine(data_small9x9, framework::dataset::make("NumKernels", { 1 }));
@@ -80,8 +88,6 @@
 TEST_SUITE(CL)
 TEST_SUITE(DirectConvolutionLayer)
 
-//TODO(COMPMID-415): Configuration tests?
-
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
@@ -223,6 +229,15 @@
 {
     validate(CLAccessor(_target), _reference, tolerance_fp32);
 }
+
+FIXTURE_DATA_TEST_CASE(RunLargeUsecase, CLDirectConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(data_nightly_usecase, framework::dataset::make("DataType",
+                       DataType::F32)),
+                       framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })),
+                       framework::dataset::make("DataLayout", { DataLayout::NHWC })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_fp32, 0.f, tolerance_fp32_abs);
+}
 TEST_SUITE_END() // FP32
 
 TEST_SUITE(FP32_CustomDataset)
@@ -264,7 +279,7 @@
                                                 DataType::QASYMM8)),
                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255, 10), QuantizationInfo(1.1f, 10) })),
                        QuantizedActivationFunctionsDataset),
-                       framework::dataset::make("DataLayout", { DataLayout::NCHW })))
+                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
diff --git a/tests/validation/CL/ElementwiseMax.cpp b/tests/validation/CL/ElementwiseMax.cpp
index 879e732..bdc47ee 100644
--- a/tests/validation/CL/ElementwiseMax.cpp
+++ b/tests/validation/CL/ElementwiseMax.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/ElementwiseMin.cpp b/tests/validation/CL/ElementwiseMin.cpp
index 332fa80..a7caac3 100644
--- a/tests/validation/CL/ElementwiseMin.cpp
+++ b/tests/validation/CL/ElementwiseMin.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/ElementwisePower.cpp b/tests/validation/CL/ElementwisePower.cpp
index ce4fc80..2cafdbb 100644
--- a/tests/validation/CL/ElementwisePower.cpp
+++ b/tests/validation/CL/ElementwisePower.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/ElementwiseSquaredDiff.cpp b/tests/validation/CL/ElementwiseSquaredDiff.cpp
index 86fdc21..58eca3f 100644
--- a/tests/validation/CL/ElementwiseSquaredDiff.cpp
+++ b/tests/validation/CL/ElementwiseSquaredDiff.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/EqualizeHistogram.cpp b/tests/validation/CL/EqualizeHistogram.cpp
index 19d9d63..6dbe01a 100644
--- a/tests/validation/CL/EqualizeHistogram.cpp
+++ b/tests/validation/CL/EqualizeHistogram.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Erode.cpp b/tests/validation/CL/Erode.cpp
index 5dc7bfb..76d30af 100644
--- a/tests/validation/CL/Erode.cpp
+++ b/tests/validation/CL/Erode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/ExpLayer.cpp b/tests/validation/CL/ExpLayer.cpp
index d9da755..16e75a6 100644
--- a/tests/validation/CL/ExpLayer.cpp
+++ b/tests/validation/CL/ExpLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/FFT.cpp b/tests/validation/CL/FFT.cpp
index 9fdd85b..12d53b5 100644
--- a/tests/validation/CL/FFT.cpp
+++ b/tests/validation/CL/FFT.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/FastCorners.cpp b/tests/validation/CL/FastCorners.cpp
index 93af59d..37ffb51 100644
--- a/tests/validation/CL/FastCorners.cpp
+++ b/tests/validation/CL/FastCorners.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Fill.cpp b/tests/validation/CL/Fill.cpp
index 26f9872..b86dae1 100644
--- a/tests/validation/CL/Fill.cpp
+++ b/tests/validation/CL/Fill.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/FillBorder.cpp b/tests/validation/CL/FillBorder.cpp
index a817338..e0b283b 100644
--- a/tests/validation/CL/FillBorder.cpp
+++ b/tests/validation/CL/FillBorder.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Flatten.cpp b/tests/validation/CL/Flatten.cpp
index ceaf123..a00041b 100644
--- a/tests/validation/CL/Flatten.cpp
+++ b/tests/validation/CL/Flatten.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Floor.cpp b/tests/validation/CL/Floor.cpp
index ef53b41..58645b9 100644
--- a/tests/validation/CL/Floor.cpp
+++ b/tests/validation/CL/Floor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/FullyConnectedLayer.cpp b/tests/validation/CL/FullyConnectedLayer.cpp
index 50094f1..78195a5 100644
--- a/tests/validation/CL/FullyConnectedLayer.cpp
+++ b/tests/validation/CL/FullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/FuseBatchNormalization.cpp b/tests/validation/CL/FuseBatchNormalization.cpp
index 35414b7..0736250 100644
--- a/tests/validation/CL/FuseBatchNormalization.cpp
+++ b/tests/validation/CL/FuseBatchNormalization.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/GEMM.cpp b/tests/validation/CL/GEMM.cpp
index 8fce006..c9540c3 100644
--- a/tests/validation/CL/GEMM.cpp
+++ b/tests/validation/CL/GEMM.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/GEMMLowp.cpp b/tests/validation/CL/GEMMLowp.cpp
index 41a441c..29649d8 100644
--- a/tests/validation/CL/GEMMLowp.cpp
+++ b/tests/validation/CL/GEMMLowp.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp b/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp
index 86c8f5a..ce000bd 100644
--- a/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp
+++ b/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/GEMMLowpMatrixMultiplyReshaped.cpp b/tests/validation/CL/GEMMLowpMatrixMultiplyReshaped.cpp
index 96e41b3..16e4a13 100644
--- a/tests/validation/CL/GEMMLowpMatrixMultiplyReshaped.cpp
+++ b/tests/validation/CL/GEMMLowpMatrixMultiplyReshaped.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRHS.cpp b/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRHS.cpp
index f4083bf..d8618bd 100644
--- a/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRHS.cpp
+++ b/tests/validation/CL/GEMMLowpMatrixMultiplyReshapedOnlyRHS.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/GEMMMatrixMultiply.cpp b/tests/validation/CL/GEMMMatrixMultiply.cpp
index e0c1a44..e521dd5 100644
--- a/tests/validation/CL/GEMMMatrixMultiply.cpp
+++ b/tests/validation/CL/GEMMMatrixMultiply.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/GEMMMatrixMultiplyInterleavedTransposed.cpp b/tests/validation/CL/GEMMMatrixMultiplyInterleavedTransposed.cpp
index 63e1b57..fcbf8ce 100644
--- a/tests/validation/CL/GEMMMatrixMultiplyInterleavedTransposed.cpp
+++ b/tests/validation/CL/GEMMMatrixMultiplyInterleavedTransposed.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/GEMMMatrixMultiplyNative.cpp b/tests/validation/CL/GEMMMatrixMultiplyNative.cpp
index c060a8b..6ba5012 100644
--- a/tests/validation/CL/GEMMMatrixMultiplyNative.cpp
+++ b/tests/validation/CL/GEMMMatrixMultiplyNative.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -118,6 +118,28 @@
 /** Broadcast bias from vector to matrix */
 const auto broadcast_bias_values = framework::dataset::make("broadcast_bias", { false, true } );
 
+/** Boundary handling cases for testing partial/non-partial (full) block dimensions, resulting from different combinations
+ * of M, M0, N and N0 values.
+ * M0 and N0 are kept constant, while the different test cases need to vary M and N.
+ *
+ * Eg. M = 64 and N = 33 result in a block dimension that has no partial blocks (all full blocks) in Y dimension and
+ * parital blocks in X dimension.
+ */
+const auto boundary_handling_cases = combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                    // Large k to force potential out-of-bound reads on input0
+                                    framework::dataset::make("K", 315),
+                                    // Batch size == 1 to force potential out-of-bound reads on input0
+                                    framework::dataset::make("batch_size", 1)),
+                                    framework::dataset::make("M0", 4)),
+                                    framework::dataset::make("N0", 4)),
+                                    framework::dataset::make("K0", 4)),
+                                    // Only need to test F32 as F16 shares identical boundary handling logics
+                                    framework::dataset::make("DataType", DataType::F32)),
+                                    framework::dataset::make("alpha", -0.75f )),
+                                    framework::dataset::make("beta", -0.35f )),
+                                    broadcast_bias_values),
+                                    framework::dataset::make("Activation", ActivationLayerInfo()));
+
 /** Configuration test */
 void validate_configuration(unsigned int m_value, unsigned int n_value, unsigned int k_value, unsigned int b_value, unsigned int m0_value, unsigned int n0_value, unsigned int k0_value, bool broadcast_bias, DataType data_type, const ActivationLayerInfo &act_info)
 {
@@ -164,6 +186,55 @@
     CLGEMMMatrixMultiplyNative gemm;
     gemm.configure(&lhs, &rhs, &bias, &dst, 1.0f, 1.0f, lhs_info, rhs_info, kernel_info);
 }
+/** Zero padding test */
+bool validate_zero_padding(unsigned int m_value, unsigned int n_value, unsigned int k_value, unsigned int b_value, unsigned int m0_value, unsigned int n0_value, unsigned int k0_value, bool broadcast_bias, DataType data_type, const ActivationLayerInfo &act_info)
+{
+    const unsigned int M = m_value;
+    const unsigned int N = n_value;
+    const unsigned int K = k_value;
+
+    GEMMLHSMatrixInfo lhs_info;
+    lhs_info.m0         = m0_value;
+    lhs_info.k0         = k0_value;
+
+    GEMMRHSMatrixInfo rhs_info;
+    rhs_info.n0         = n0_value;
+    rhs_info.k0         = k0_value;
+
+    GEMMKernelInfo kernel_info;
+    kernel_info.m               = M;
+    kernel_info.n               = N;
+    kernel_info.k               = K;
+    kernel_info.broadcast_bias  = broadcast_bias;
+    kernel_info.activation_info = act_info;
+
+    const TensorShape lhs_shape(K, M, b_value);
+    const TensorShape rhs_shape(N, K, b_value);
+    const TensorShape bias_shape(N,
+                                 broadcast_bias? 1 : M,
+                                 broadcast_bias? 1 : b_value);
+    const TensorShape dst_shape = compute_mm_shape(TensorInfo(lhs_shape, 1, data_type),
+                                                   TensorInfo(rhs_shape, 1, data_type),
+                                                   kernel_info);
+
+    // Create tensors
+    CLTensor lhs  = create_tensor<CLTensor>(lhs_shape, data_type);
+    CLTensor rhs  = create_tensor<CLTensor>(rhs_shape, data_type);
+    CLTensor bias = create_tensor<CLTensor>(bias_shape, data_type);
+    CLTensor dst  = create_tensor<CLTensor>(dst_shape, data_type);
+
+    ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Create and configure function
+    CLGEMMMatrixMultiplyNative gemm;
+    gemm.configure(&lhs, &rhs, &bias, &dst, 1.0f, 1.0f, lhs_info, rhs_info, kernel_info);
+
+    // Padding can be added along rhs and bias's X dimension
+    return dst.info()->padding().empty() && lhs.info()->padding().empty() && bias.info()->padding().bottom == 0 && bias.info()->padding().top == 0;
+}
 } // namespace
 
 TEST_SUITE(CL)
@@ -185,6 +256,69 @@
     validate_configuration(m_value, n_value, k_value, b_value, m0_value, n0_value, k0_value, broadcast_bias, DataType::F32, act_value);
 }
 
+/** Validate zero padding tests
+ *
+ * A series of validation tests to check that no padding is added as part of configuration for 4 different scenarios.
+ *
+ * Checks performed in order:
+ *     - No partial blocks in both x and y dimensions
+ *     - Partial blocks in x dimension
+ *     - Partial blocks in y dimension
+ *     - Partial blocks in both x and y dimensions
+ *     - No blocks in both x and y dimensions, scalar store (N0==1)
+ *     - Special case: partial_n0 == 5 (vstore1 should be invoked instead of vstore_partial_1)
+ */
+DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(zip(zip(
+framework::dataset::make("M",                   { 24, 64, 101,   1, 50, 256, }),
+framework::dataset::make("N",                   { 48, 29,  16, 122, 20,  21, })),
+framework::dataset::make("M0",                  { 4,   8,   7,   2,  1,   8, })),
+framework::dataset::make("N0",                  { 4,   4,  16,   3,  1,   8, })),
+m_value, n_value, m0_value, n0_value)
+{
+    bool status = validate_zero_padding(m_value, n_value, 23, 1, m0_value, n0_value, 4, false, DataType::F32, ActivationLayerInfo());
+    ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBoundaryHandlingPartialInXPartialInY, CLGEMMMatrixMultiplyNativeFixture<float>, framework::DatasetMode::ALL,
+                combine(combine(
+                        framework::dataset::make("M", 3),
+                        framework::dataset::make("N", 1)),
+                        boundary_handling_cases))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBoundaryHandlingPartialInXFullInY, CLGEMMMatrixMultiplyNativeFixture<float>, framework::DatasetMode::ALL,
+                combine(combine(
+                        framework::dataset::make("M", 64),
+                        framework::dataset::make("N", 51)),
+                        boundary_handling_cases))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBoundaryHandlingFullInXFullInY, CLGEMMMatrixMultiplyNativeFixture<float>, framework::DatasetMode::ALL,
+                combine(combine(
+                        framework::dataset::make("M", 64),
+                        framework::dataset::make("N", 32)),
+                        boundary_handling_cases))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBoundaryHandlingFullInXPartialInY, CLGEMMMatrixMultiplyNativeFixture<float>, framework::DatasetMode::ALL,
+                combine(combine(
+                        framework::dataset::make("M", 37),
+                        framework::dataset::make("N", 32)),
+                        boundary_handling_cases))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
 FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyNativeFixture<float>, framework::DatasetMode::ALL,
                 combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
                                                                    m_values,
diff --git a/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp b/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp
index 833a924..d7853f3 100644
--- a/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp
+++ b/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -147,6 +147,12 @@
 /** K0 values to test - Nightly */
 const auto k0_values_nightly = framework::dataset::make("K0", { 2, 3, 4, 8 });
 
+/** N0 values to test with export to OpenCL image object - Nightly */
+const auto n0_export_to_cl_image_values_nightly = framework::dataset::make("N0", { 4, 8, 16 });
+
+/** K0 values to test with export to OpenCL image object - Nightly */
+const auto k0_export_to_cl_image_values_nightly = framework::dataset::make("K0", { 4, 8, 16 });
+
 /** V0 values to test - Nightly */
 const auto v0_values_nightly = framework::dataset::make("V0", 1, 4);
 
@@ -164,11 +170,101 @@
 
 /** LHS transposed values */
 const auto lhs_transpose_values = framework::dataset::make("lhs_transpose", { false, true } );
+
+/** Zero padding test */
+bool validate_zero_padding(unsigned int m_value, unsigned int n_value, unsigned int k_value, unsigned int b_value,
+                            unsigned int m0_value, unsigned int n0_value, unsigned int k0_value, unsigned int h0_value,
+                            bool i_value_rhs, bool t_value_rhs, bool export_to_cl_image, bool broadcast_bias, unsigned int depth_output_gemm3d, const ActivationLayerInfo &act_info,
+                            DataType dt_input0, DataType dt_input1, DataType dt_input2, DataType dt_output, float alpha, float beta)
+{
+    const unsigned int M = m_value;
+    const unsigned int N = n_value;
+    const unsigned int K = k_value;
+
+    GEMMLHSMatrixInfo lhs_info;
+    lhs_info.m0         = m0_value;
+    lhs_info.k0         = k0_value;
+
+    GEMMRHSMatrixInfo rhs_info;
+    rhs_info.n0         = n0_value;
+    rhs_info.k0         = k0_value;
+    rhs_info.h0         = h0_value;
+    rhs_info.interleave = i_value_rhs;
+    rhs_info.transpose  = t_value_rhs;
+    rhs_info.export_to_cl_image = export_to_cl_image;
+
+    GEMMKernelInfo kernel_info;
+    kernel_info.m                       = M;
+    kernel_info.n                       = N;
+    kernel_info.k                       = K;
+    kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
+    kernel_info.reinterpret_input_as_3d = false;
+    kernel_info.broadcast_bias          = broadcast_bias;
+    kernel_info.activation_info         = act_info;
+
+    const TensorShape lhs_shape(K, M, b_value);
+    const TensorShape rhs_shape(N, K, b_value);
+    const TensorShape lhs_shape_reshaped = compute_lhs_reshaped_shape(TensorInfo(lhs_shape, 1, dt_input0),
+                                                                      lhs_info);
+    const TensorShape rhs_shape_reshaped = compute_rhs_reshaped_shape(TensorInfo(rhs_shape, 1, dt_input1),
+                                                                      rhs_info);
+
+    const TensorShape dst_shape = compute_mm_shape(TensorInfo(lhs_shape_reshaped, 1, dt_input0),
+                                                   TensorInfo(rhs_shape_reshaped, 1, dt_input1),
+                                                   kernel_info);
+
+    const TensorShape bias_shape(N,
+                                 M, // Correct calculation should be: broadcast_bias? 1 : M, it's wrong here on purpose just for validation test
+                                 broadcast_bias? 1 : b_value);
+
+    // Create tensors
+    CLTensor lhs_reshaped  = create_tensor<CLTensor>(lhs_shape_reshaped, dt_input0);
+    CLTensor rhs_reshaped  = create_tensor<CLTensor>(rhs_shape_reshaped, dt_input1);
+    CLTensor bias = create_tensor<CLTensor>(bias_shape, dt_input2);
+    CLTensor dst  = create_tensor<CLTensor>(dst_shape, dt_output);
+
+    ARM_COMPUTE_EXPECT(lhs_reshaped.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(rhs_reshaped.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Validate zero-padding
+    CLGEMMMatrixMultiplyReshaped gemm;
+
+    gemm.configure(&lhs_reshaped, &rhs_reshaped, &bias, &dst, alpha, beta, lhs_info, rhs_info, kernel_info);
+
+    // Padding can be added along rhs and bias's X/Y dimension
+    return dst.info()->padding().empty() && lhs_reshaped.info()->padding().empty();
+}
 } // namespace
 
 TEST_SUITE(CL)
 TEST_SUITE(GEMMMatrixMultiplyReshaped)
 
+/** Validate zero padding tests
+ *
+ * A series of validation tests to check the zero padding requirement
+ *
+ * Checks performed in order:
+ *     - No partial blocks in both x and y dimensions
+ *     - Partial blocks in x dimension
+ *     - Partial blocks in y dimension
+ *     - Partial blocks in both x and y dimensions
+ *     - Special case: partial_n0 == 9 (vstore1 should be invoked instead of vstore_partial_1)
+ */
+DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(zip(zip(
+framework::dataset::make("M",                   { 24, 64, 101,   1, 103 }),
+framework::dataset::make("N",                   { 48, 29,  16, 121,  41 })),
+framework::dataset::make("M0",                  {  4,  8,   4,   2,   4 })),
+framework::dataset::make("N0",                  {  4,  4,  16,   2,  16 })),
+m_value, n_value, m0_value, n0_value)
+{
+    constexpr DataType dt = DataType::F32;
+
+    bool status = validate_zero_padding(m_value, n_value, 23, 1, m0_value, n0_value, 4, 1, false, false, false, 0, 0, ActivationLayerInfo(), dt, dt, dt, dt, 1.0f, 1.0f);
+    ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
+}
+
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(
@@ -224,14 +320,14 @@
 
                                 })),
                framework::dataset::make("RHSMInfo",{
-                                                          GEMMRHSMatrixInfo(4,4,1,true,true),
-                                                          GEMMRHSMatrixInfo(4,4,1, true,true),
-                                                          GEMMRHSMatrixInfo(4,4,1,true,true),
-                                                          GEMMRHSMatrixInfo(2,2,1,true,false),
-                                                          GEMMRHSMatrixInfo(2,2,1,true,false),
-                                                          GEMMRHSMatrixInfo(4,4,1,true,true),
-                                                          GEMMRHSMatrixInfo(4,4,1,true,true),
-                                                          GEMMRHSMatrixInfo(4,4,2,true,false),
+                                                          GEMMRHSMatrixInfo(4,4,1,true,true,false),
+                                                          GEMMRHSMatrixInfo(4,4,1,true,true,false),
+                                                          GEMMRHSMatrixInfo(4,4,1,true,true,false),
+                                                          GEMMRHSMatrixInfo(2,2,1,true,false,false),
+                                                          GEMMRHSMatrixInfo(2,2,1,true,false,false),
+                                                          GEMMRHSMatrixInfo(4,4,1,true,true,false),
+                                                          GEMMRHSMatrixInfo(4,4,1,true,true,false),
+                                                          GEMMRHSMatrixInfo(4,4,2,true,false,false),
 
 
                            })),
@@ -248,7 +344,7 @@
                                                                      1   /**< Multiplication factor for the width of the 1xW transposed block */,
                                                                      1   /**< Multiplication factor for the height of the 4x4 interleaved block */,
                                                                      GEMMLHSMatrixInfo(4,4,1,false,true),
-                                                                     GEMMRHSMatrixInfo(4,4,1,true,true),
+                                                                     GEMMRHSMatrixInfo(4,4,1,true,true,false),
                                                                      0  /**< Offset to be added to each element of the matrix A */,
                                                                      0 /**< Offset to be added to each element of the matrix B */),
 
@@ -262,7 +358,7 @@
                                                                      1   /**< Multiplication factor for the width of the 1xW transposed block */,
                                                                      1   /**< Multiplication factor for the height of the 4x4 interleaved block */,
                                                                      GEMMLHSMatrixInfo(4,4,1,false,true),
-                                                                     GEMMRHSMatrixInfo(4,4,1,true,true),
+                                                                     GEMMRHSMatrixInfo(4,4,1,true,true,false),
                                                                      0  /**< Offset to be added to each element of the matrix A */,
                                                                      0 /**< Offset to be added to each element of the matrix B */),
                                                             GEMMKernelInfo(),
@@ -279,7 +375,7 @@
                                                                      1   /**< Multiplication factor for the width of the 1xW transposed block */,
                                                                      1   /**< Multiplication factor for the height of the 4x4 interleaved block */,
                                                                      GEMMLHSMatrixInfo(4,4,1,false,true),
-                                                                     GEMMRHSMatrixInfo(4,4,1,true,true),
+                                                                     GEMMRHSMatrixInfo(4,4,1,true,true,false),
                                                                      0  /**< Offset to be added to each element of the matrix A */,
                                                                      0 /**< Offset to be added to each element of the matrix B */),
 
@@ -294,7 +390,7 @@
                                                                      1   /**< Multiplication factor for the width of the 1xW transposed block */,
                                                                      1   /**< Multiplication factor for the height of the 4x4 interleaved block */,
                                                                      GEMMLHSMatrixInfo(4,4,1,false,true),
-                                                                     GEMMRHSMatrixInfo(4,4,1,true,true),
+                                                                     GEMMRHSMatrixInfo(4,4,1,true,true,false),
                                                                      0  /**< Offset to be added to each element of the matrix A */,
                                                                      0 /**< Offset to be added to each element of the matrix B */),
 
@@ -308,7 +404,7 @@
                                                                      1   /**< Multiplication factor for the width of the 1xW transposed block */,
                                                                      1   /**< Multiplication factor for the height of the 4x4 interleaved block */,
                                                                      GEMMLHSMatrixInfo(4,4,1,false,true),
-                                                                     GEMMRHSMatrixInfo(4,4,2,true,false),
+                                                                     GEMMRHSMatrixInfo(4,4,2,true,false,false),
                                                                      0  /**< Offset to be added to each element of the matrix A */,
                                                                      0 /**< Offset to be added to each element of the matrix B */),
                                                     })),
@@ -327,7 +423,7 @@
 TEST_SUITE(FP32)
 
 FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedFixture<float>, framework::DatasetMode::ALL,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
                                                                    m_values,
                                                                    n_values),
                                                                    k_values),
@@ -339,6 +435,7 @@
                                                                    h0_values_precommit),
                                                                    i_values_lhs),
                                                                    i_values_rhs),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", false)),
                                                                    framework::dataset::make("DataType", DataType::F32)),
                                                                    a_values_precommit),
                                                                    beta_values_precommit),
@@ -351,7 +448,7 @@
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMMatrixMultiplyReshapedFixture<float>, framework::DatasetMode::DISABLED,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
                                                                    m_values,
                                                                    n_values),
                                                                    k_values),
@@ -363,6 +460,7 @@
                                                                    h0_values_nightly),
                                                                    i_values_lhs),
                                                                    i_values_rhs),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", false)),
                                                                    framework::dataset::make("DataType", DataType::F32)),
                                                                    a_values_nightly),
                                                                    beta_values_nightly),
@@ -375,7 +473,7 @@
 }
 
 FIXTURE_DATA_TEST_CASE(RunSmall3D, CLGEMMMatrixMultiplyReshaped3DFixture<float>, framework::DatasetMode::ALL,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
                                                                    m_w_values,
                                                                    m_h_values),
                                                                    n_values),
@@ -388,6 +486,7 @@
                                                                    h0_values_precommit),
                                                                    i_values_lhs),
                                                                    i_values_rhs),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", false)),
                                                                    framework::dataset::make("DataType", DataType::F32)),
                                                                    a_values_precommit),
                                                                    beta_values_precommit),
@@ -399,7 +498,7 @@
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture<float>, framework::DatasetMode::DISABLED,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
                                                                    m_w_values,
                                                                    m_h_values),
                                                                    n_values),
@@ -412,6 +511,7 @@
                                                                    h0_values_nightly),
                                                                    i_values_lhs),
                                                                    i_values_rhs),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", false)),
                                                                    framework::dataset::make("DataType", DataType::F32)),
                                                                    a_values_nightly),
                                                                    beta_values_nightly),
@@ -421,12 +521,137 @@
     // Validate output
     validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
 }
-TEST_SUITE_END() // FP32
+TEST_SUITE(ExportToCLImage)
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(
+               framework::dataset::make("Input0Info", { TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F32),  // OK or incorrect if cl_khr_image2d_from_buffer not supported
+                                                        TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F32),  // OK or incorrect if cl_khr_image2d_from_buffer not supported
+                                                        TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F32),  // OK or incorrect if cl_khr_image2d_from_buffer not supported
+                                                        TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F32),  // Incorrect k0
+                                                        TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F32),  // Incorrect n0
 
-TEST_SUITE(FP16)
+                                                      }),
+               framework::dataset::make("Input1Info",{ TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(512U, 8U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(128U, 32U, 2U), 1, DataType::F32),
 
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedFixture<half>, framework::DatasetMode::ALL,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                      })),
+               framework::dataset::make("Input2Info", { TensorInfo(TensorShape(64U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(64U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(64U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(64U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(64U), 1, DataType::F32),
+
+                                                      })),
+               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(64U, 64U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(64U, 64U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(64U, 64U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(64U, 64U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(64U, 64U, 2U), 1, DataType::F32),
+                                                       TensorInfo(TensorShape(64U, 64U, 2U), 1, DataType::F32),
+
+                           })),
+               framework::dataset::make("LHSMInfo",{
+                                                          GEMMLHSMatrixInfo(4, 4, 1, false, true),
+                                                          GEMMLHSMatrixInfo(4, 8, 1, false, true),
+                                                          GEMMLHSMatrixInfo(4, 4, 1, false, true),
+                                                          GEMMLHSMatrixInfo(4, 2, 1, false, false),
+                                                          GEMMLHSMatrixInfo(4, 4, 1, false, false),
+
+                                })),
+               framework::dataset::make("RHSMInfo",{
+                                                          GEMMRHSMatrixInfo(4, 4, 1, true, true, true),
+                                                          GEMMRHSMatrixInfo(4, 8, 1, true, true, true),
+                                                          GEMMRHSMatrixInfo(8, 4, 1, true, true, true),
+                                                          GEMMRHSMatrixInfo(4, 2, 1, true, false, true),
+                                                          GEMMRHSMatrixInfo(2, 4, 1, true, false, true),
+                           })),
+               framework::dataset::make("GEMMInfo",{GEMMKernelInfo( 64 /**<M Number of LHS rows*/,
+                                                                    64 /**<N Number of RHS columns*/,
+                                                                    64 /**<K Number of LHS columns or RHS rows */, 0 /**< Depth of the output tensor in case is reinterpreted as 3D */,
+                                                             false /**< reinterpret the input as 3D */,
+                                                             true  /**< Flag used to broadcast the bias addition */,
+                                                             false /**< wider accumm */,
+                                                           ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                             1   /**< Multiplication factor for the width of the 1xW transposed block */,
+                                                             1   /**< Multiplication factor for the height of the 4x4 interleaved block */,
+                                                             GEMMLHSMatrixInfo(),
+                                                             GEMMRHSMatrixInfo(),
+                                                             0  /**< Offset to be added to each element of the matrix A */,
+                                                             0 /**< Offset to be added to each element of the matrix B */),
+                                                    GEMMKernelInfo( 64 /**<M Number of LHS rows*/,
+                                                                    64 /**<N Number of RHS columns*/,
+                                                                    64 /**<K Number of LHS columns or RHS rows */, 0 /**< Depth of the output tensor in case is reinterpreted as 3D */,
+                                                             false /**< reinterpret the input as 3D */,
+                                                             true  /**< Flag used to broadcast the bias addition */,
+                                                             false /**< wider accumm */,
+                                                           ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                             1   /**< Multiplication factor for the width of the 1xW transposed block */,
+                                                             1   /**< Multiplication factor for the height of the 4x4 interleaved block */,
+                                                             GEMMLHSMatrixInfo(),
+                                                             GEMMRHSMatrixInfo(),
+                                                             0  /**< Offset to be added to each element of the matrix A */,
+                                                             0 /**< Offset to be added to each element of the matrix B */),
+                                                    GEMMKernelInfo( 64 /**<M Number of LHS rows*/,
+                                                                    64 /**<N Number of RHS columns*/,
+                                                                    64 /**<K Number of LHS columns or RHS rows */, 0 /**< Depth of the output tensor in case is reinterpreted as 3D */,
+                                                             false /**< reinterpret the input as 3D */,
+                                                             true  /**< Flag used to broadcast the bias addition */,
+                                                             false /**< wider accumm */,
+                                                           ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                             1   /**< Multiplication factor for the width of the 1xW transposed block */,
+                                                             1   /**< Multiplication factor for the height of the 4x4 interleaved block */,
+                                                             GEMMLHSMatrixInfo(),
+                                                             GEMMRHSMatrixInfo(),
+                                                             0  /**< Offset to be added to each element of the matrix A */,
+                                                             0 /**< Offset to be added to each element of the matrix B */),
+
+                                                    GEMMKernelInfo( 64 /**<M Number of LHS rows*/,
+                                                                    64 /**<N Number of RHS columns*/,
+                                                                    64 /**<K Number of LHS columns or RHS rows */, 0 /**< Depth of the output tensor in case is reinterpreted as 3D */,
+                                                             false /**< reinterpret the input as 3D */,
+                                                             true  /**< Flag used to broadcast the bias addition */,
+                                                             false /**< wider accumm */,
+                                                           ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                             1   /**< Multiplication factor for the width of the 1xW transposed block */,
+                                                             1   /**< Multiplication factor for the height of the 4x4 interleaved block */,
+                                                             GEMMLHSMatrixInfo(),
+                                                             GEMMRHSMatrixInfo(),
+                                                             0  /**< Offset to be added to each element of the matrix A */,
+                                                             0 /**< Offset to be added to each element of the matrix B */),
+                                                    GEMMKernelInfo( 64 /**<M Number of LHS rows*/,
+                                                                    64 /**<N Number of RHS columns*/,
+                                                                    64 /**<K Number of LHS columns or RHS rows */, 0 /**< Depth of the output tensor in case is reinterpreted as 3D */,
+                                                             false /**< reinterpret the input as 3D */,
+                                                             true  /**< Flag used to broadcast the bias addition */,
+                                                             false /**< wider accumm */,
+                                                           ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                             1   /**< Multiplication factor for the width of the 1xW transposed block */,
+                                                             1   /**< Multiplication factor for the height of the 4x4 interleaved block */,
+                                                             GEMMLHSMatrixInfo(),
+                                                             GEMMRHSMatrixInfo(),
+                                                             0  /**< Offset to be added to each element of the matrix A */,
+                                                             0 /**< Offset to be added to each element of the matrix B */)
+                                                    })),
+               framework::dataset::make("Expected", { true,
+                                                      true,
+                                                      true,
+                                                      false,
+                                                      false})),
+                    input0_info ,input1_info, input2_info, output_info, lhs_info, rhs_info, gemm_info, expected)
+{
+   ARM_COMPUTE_EXPECT(bool(CLGEMMMatrixMultiplyReshapedKernel::validate(&input0_info.clone()->set_is_resizable(true),
+                                                          &input1_info.clone()->set_is_resizable(true),
+                                                          &input2_info.clone()->set_is_resizable(true),
+                                                          &output_info.clone()->set_is_resizable(true),1.f,1.f,
+                                                          lhs_info,
+                                                          rhs_info,
+                                                          gemm_info)) == (expected && image2d_from_buffer_supported(CLKernelLibrary::get().get_device())), framework::LogLevel::ERRORS);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedFixture<float>, framework::DatasetMode::ALL,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
                                                                    m_values,
                                                                    n_values),
                                                                    k_values),
@@ -438,6 +663,144 @@
                                                                    h0_values_precommit),
                                                                    i_values_lhs),
                                                                    i_values_rhs),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", true)),
+                                                                   framework::dataset::make("DataType", DataType::F32)),
+                                                                   a_values_precommit),
+                                                                   beta_values_precommit),
+                                                                   broadcast_bias_values),
+                                                                   lhs_transpose_values),
+                                                                   act_values))
+{
+    // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension
+    if(image2d_from_buffer_supported(CLKernelLibrary::get().get_device()))
+    {
+        validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMMatrixMultiplyReshapedFixture<float>, framework::DatasetMode::NIGHTLY,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_values,
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0_values_nightly),
+                                                                   n0_export_to_cl_image_values_nightly),
+                                                                   k0_export_to_cl_image_values_nightly),
+                                                                   v0_values_nightly),
+                                                                   h0_values_nightly),
+                                                                   i_values_lhs),
+                                                                   i_values_rhs),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", true)),
+                                                                   framework::dataset::make("DataType", DataType::F32)),
+                                                                   a_values_nightly),
+                                                                   beta_values_nightly),
+                                                                   broadcast_bias_values),
+                                                                   lhs_transpose_values),
+                                                                   act_values))
+{
+    // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension
+    if(image2d_from_buffer_supported(CLKernelLibrary::get().get_device()))
+    {
+        validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall3D, CLGEMMMatrixMultiplyReshaped3DFixture<float>, framework::DatasetMode::ALL,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_w_values,
+                                                                   m_h_values),
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0_values_precommit),
+                                                                   n0_values_precommit),
+                                                                   k0_values_precommit),
+                                                                   v0_values_precommit),
+                                                                   h0_values_precommit),
+                                                                   i_values_lhs),
+                                                                   i_values_rhs),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", true)),
+                                                                   framework::dataset::make("DataType", DataType::F32)),
+                                                                   a_values_precommit),
+                                                                   beta_values_precommit),
+                                                                   lhs_transpose_values),
+                                                                   act_values))
+{
+    // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension
+    if(image2d_from_buffer_supported(CLKernelLibrary::get().get_device()))
+    {
+        validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture<float>, framework::DatasetMode::NIGHTLY,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_w_values,
+                                                                   m_h_values),
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0_values_nightly),
+                                                                   n0_export_to_cl_image_values_nightly),
+                                                                   k0_export_to_cl_image_values_nightly),
+                                                                   v0_values_nightly),
+                                                                   h0_values_nightly),
+                                                                   i_values_lhs),
+                                                                   i_values_rhs),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", true)),
+                                                                   framework::dataset::make("DataType", DataType::F32)),
+                                                                   a_values_nightly),
+                                                                   beta_values_nightly),
+                                                                   lhs_transpose_values),
+                                                                   act_values))
+{
+    // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension
+    if(image2d_from_buffer_supported(CLKernelLibrary::get().get_device()))
+    {
+        validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+TEST_SUITE_END() // ExportToCLImage
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedFixture<half>, framework::DatasetMode::ALL,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_values,
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0_values_precommit),
+                                                                   n0_values_precommit),
+                                                                   k0_values_precommit),
+                                                                   v0_values_precommit),
+                                                                   h0_values_precommit),
+                                                                   i_values_lhs),
+                                                                   i_values_rhs),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", false)),
                                                                    framework::dataset::make("DataType", DataType::F16)),
                                                                    a_values_precommit),
                                                                    beta_values_precommit),
@@ -450,7 +813,7 @@
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMMatrixMultiplyReshapedFixture<half>, framework::DatasetMode::DISABLED,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
                                                                    m_values,
                                                                    n_values),
                                                                    k_values),
@@ -462,6 +825,7 @@
                                                                    h0_values_nightly),
                                                                    i_values_lhs),
                                                                    i_values_rhs),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", false)),
                                                                    framework::dataset::make("DataType", DataType::F16)),
                                                                    a_values_nightly),
                                                                    beta_values_nightly),
@@ -474,7 +838,7 @@
 }
 
 FIXTURE_DATA_TEST_CASE(RunSmall3D, CLGEMMMatrixMultiplyReshaped3DFixture<half>, framework::DatasetMode::ALL,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
                                                                    m_w_values,
                                                                    m_h_values),
                                                                    n_values),
@@ -487,6 +851,7 @@
                                                                    h0_values_precommit),
                                                                    i_values_lhs),
                                                                    i_values_rhs),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", false)),
                                                                    framework::dataset::make("DataType", DataType::F16)),
                                                                    a_values_precommit),
                                                                    beta_values_precommit),
@@ -498,7 +863,7 @@
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture<half>, framework::DatasetMode::DISABLED,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
                                                                    m_w_values,
                                                                    m_h_values),
                                                                    n_values),
@@ -511,6 +876,7 @@
                                                                    h0_values_nightly),
                                                                    i_values_lhs),
                                                                    i_values_rhs),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", false)),
                                                                    framework::dataset::make("DataType", DataType::F16)),
                                                                    a_values_nightly),
                                                                    beta_values_nightly),
@@ -525,7 +891,7 @@
 TEST_SUITE(MixedPrecision)
 
 FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedMixedPrecisionFixture<half>, framework::DatasetMode::ALL,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
                                                                    m_values,
                                                                    n_values),
                                                                    k_values),
@@ -537,6 +903,7 @@
                                                                    h0_values_precommit),
                                                                    i_values_lhs),
                                                                    i_values_rhs),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", false)),
                                                                    framework::dataset::make("DataType", DataType::F16)),
                                                                    a_values_precommit),
                                                                    beta_values_precommit),
@@ -549,7 +916,7 @@
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMMatrixMultiplyReshapedMixedPrecisionFixture<half>, framework::DatasetMode::DISABLED,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
                                                                    m_values,
                                                                    n_values),
                                                                    k_values),
@@ -561,6 +928,7 @@
                                                                    h0_values_nightly),
                                                                    i_values_lhs),
                                                                    i_values_rhs),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", false)),
                                                                    framework::dataset::make("DataType", DataType::F16)),
                                                                    a_values_nightly),
                                                                    beta_values_nightly),
@@ -573,7 +941,7 @@
 }
 
 FIXTURE_DATA_TEST_CASE(RunSmall3D, CLGEMMMatrixMultiplyReshaped3DMixedPrecisionFixture<half>, framework::DatasetMode::ALL,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
                                                                    m_w_values,
                                                                    m_h_values),
                                                                    n_values),
@@ -586,6 +954,7 @@
                                                                    h0_values_precommit),
                                                                    i_values_lhs),
                                                                    i_values_rhs),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", false)),
                                                                    framework::dataset::make("DataType", DataType::F16)),
                                                                    a_values_precommit),
                                                                    beta_values_precommit),
@@ -597,7 +966,7 @@
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DMixedPrecisionFixture<half>, framework::DatasetMode::DISABLED,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
                                                                    m_w_values,
                                                                    m_h_values),
                                                                    n_values),
@@ -610,6 +979,7 @@
                                                                    h0_values_nightly),
                                                                    i_values_lhs),
                                                                    i_values_rhs),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", false)),
                                                                    framework::dataset::make("DataType", DataType::F16)),
                                                                    a_values_nightly),
                                                                    beta_values_nightly),
diff --git a/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp b/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp
index b8b5860..bd0cd03 100644
--- a/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp
+++ b/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -67,6 +67,9 @@
 RelativeTolerance<float> rel_tolerance_f32(0.001f);
 constexpr float          abs_tolerance_f32(0.0001f);
 
+RelativeTolerance<float> rel_tolerance_f16(0.001f);
+constexpr float          abs_tolerance_f16(0.01f);
+
 /** Alpha values to test */
 const auto a_values = framework::dataset::make("alpha", {-0.75f} );
 
@@ -98,14 +101,23 @@
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 8.f, 2.f),
 });
 
-/** M0 values to test */
-const auto m0_values = framework::dataset::make("M0", { 8 });
+/** M0 values to test - precommit */
+const auto m0_values_precommit = framework::dataset::make("M0", { 4 });
 
-/** N0 values to test */
-const auto n0_values = framework::dataset::make("N0", { 16 });
+/** N0 values to test - precommit*/
+const auto n0_values_precommit = framework::dataset::make("N0", { 4 });
 
-/** K0 values to test */
-const auto k0_values = framework::dataset::make("K0", { 16 });
+/** K0 values to test - precommit*/
+const auto k0_values_precommit = framework::dataset::make("K0", { 4 });
+
+/** M0 values to test - nightly */
+const auto m0_values_nightly = framework::dataset::make("M0", { 8 });
+
+/** N0 values to test - nightly */
+const auto n0_values_nightly = framework::dataset::make("N0", { 16 });
+
+/** K0 values to test - nightly */
+const auto k0_values_nightly = framework::dataset::make("K0", { 16 });
 
 /** H0 values to test */
 const auto h0_values = framework::dataset::make("H0", 1, 3);
@@ -119,10 +131,36 @@
 /** Broadcast bias from vector to matrix */
 const auto broadcast_bias_values = framework::dataset::make("broadcast_bias", { false, true } );
 
+/** Boundary handling cases for testing partial/non-partial (full) block dimensions, resulting from different combinations
+ * of M, M0, N and N0 values.
+ * M0 and N0 are kept constant, while the different test cases need to vary M and N.
+ *
+ * Eg. M = 64 and N = 33 result in a block dimension that has no partial blocks (all full blocks) in Y dimension and
+ * parital blocks in X dimension.
+ */
+const auto boundary_handling_cases = combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                    // Large k to force potential out-of-bound reads on input0
+                                    framework::dataset::make("K", 315),
+                                    // Batch size == 1 to force potential out-of-bound reads on input0
+                                    framework::dataset::make("batch_size", 1)),
+                                    framework::dataset::make("M0", 4)),
+                                    framework::dataset::make("N0", 4)),
+                                    framework::dataset::make("K0", 4)),
+                                    framework::dataset::make("H0", 3)),
+                                    i_values_rhs),
+                                    t_values_rhs),
+                                    framework::dataset::make("export_to_cl_image_rhs", {true, false})),
+                                    // Only need to test F32 as F16 shares identical boundary handling logics
+                                    framework::dataset::make("DataType", DataType::F32)),
+                                    framework::dataset::make("alpha", -0.75f )),
+                                    framework::dataset::make("beta", -0.35f )),
+                                    broadcast_bias_values),
+                                    framework::dataset::make("Activation", ActivationLayerInfo()));
+
 /** Configuration test */
 bool validate_configuration(unsigned int m_value, unsigned int n_value, unsigned int k_value, unsigned int b_value,
                             unsigned int m0_value, unsigned int n0_value, unsigned int k0_value, unsigned int h0_value,
-                            bool i_value_rhs, bool t_value_rhs, bool broadcast_bias, bool input_as_3d, unsigned int depth_output_gemm3d, const ActivationLayerInfo &act_info,
+                            bool i_value_rhs, bool t_value_rhs, bool export_to_cl_image, bool broadcast_bias, bool input_as_3d, unsigned int depth_output_gemm3d, const ActivationLayerInfo &act_info,
                             DataType dt_input0, DataType dt_input1, DataType dt_input2, DataType dt_output, float alpha, float beta)
 {
     const unsigned int M = m_value;
@@ -139,6 +177,7 @@
     rhs_info.h0         = h0_value;
     rhs_info.interleave = i_value_rhs;
     rhs_info.transpose  = t_value_rhs;
+    rhs_info.export_to_cl_image = export_to_cl_image;
 
     GEMMKernelInfo kernel_info;
     kernel_info.m                       = M;
@@ -172,6 +211,70 @@
     CLGEMMMatrixMultiplyReshapedOnlyRHS gemm;
     return bool(gemm.validate(&lhs, &rhs_reshaped, &bias, &dst, alpha, beta, lhs_info, rhs_info, kernel_info));
 }
+
+/** Zero padding test */
+bool validate_zero_padding(unsigned int m_value, unsigned int n_value, unsigned int k_value, unsigned int b_value,
+                            unsigned int m0_value, unsigned int n0_value, unsigned int k0_value, unsigned int h0_value,
+                            bool i_value_rhs, bool t_value_rhs, bool export_to_cl_image, bool broadcast_bias, bool input_as_3d, unsigned int depth_output_gemm3d, const ActivationLayerInfo &act_info,
+                            DataType dt_input0, DataType dt_input1, DataType dt_input2, DataType dt_output, float alpha, float beta)
+{
+    const unsigned int M = m_value;
+    const unsigned int N = n_value;
+    const unsigned int K = k_value;
+
+    GEMMLHSMatrixInfo lhs_info;
+    lhs_info.m0         = m0_value;
+    lhs_info.k0         = k0_value;
+
+    GEMMRHSMatrixInfo rhs_info;
+    rhs_info.n0         = n0_value;
+    rhs_info.k0         = k0_value;
+    rhs_info.h0         = h0_value;
+    rhs_info.interleave = i_value_rhs;
+    rhs_info.transpose  = t_value_rhs;
+    rhs_info.export_to_cl_image = export_to_cl_image;
+
+    GEMMKernelInfo kernel_info;
+    kernel_info.m                       = M;
+    kernel_info.n                       = N;
+    kernel_info.k                       = K;
+    kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
+    kernel_info.reinterpret_input_as_3d = input_as_3d;
+    kernel_info.broadcast_bias          = broadcast_bias;
+    kernel_info.activation_info         = act_info;
+
+    const TensorShape lhs_shape(K, M, b_value);
+    const TensorShape rhs_shape(N, K, b_value);
+    const TensorShape rhs_shape_reshaped = compute_rhs_reshaped_shape(TensorInfo(rhs_shape, 1, dt_input1),
+                                                                      rhs_info);
+
+    const TensorShape dst_shape = compute_mm_shape(TensorInfo(lhs_shape, 1, dt_input0),
+                                                   TensorInfo(rhs_shape_reshaped, 1, dt_input1),
+                                                   kernel_info);
+
+    const TensorShape bias_shape(N,
+                                 M, // Correct calculation should be: broadcast_bias? 1 : M, it's wrong here on purpose just for validation test
+                                 broadcast_bias? 1 : b_value);
+
+    // Create tensors
+    CLTensor lhs  = create_tensor<CLTensor>(lhs_shape, dt_input0);
+    CLTensor rhs_reshaped  = create_tensor<CLTensor>(rhs_shape_reshaped, dt_input1);
+    CLTensor bias = create_tensor<CLTensor>(bias_shape, dt_input2);
+    CLTensor dst  = create_tensor<CLTensor>(dst_shape, dt_output);
+
+    ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(rhs_reshaped.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Validate zero-padding
+    CLGEMMMatrixMultiplyReshapedOnlyRHS gemm;
+
+    gemm.configure(&lhs, &rhs_reshaped, &bias, &dst, alpha, beta, lhs_info, rhs_info, kernel_info);
+
+    // Padding can be added along rhs and bias's X dimension
+    return dst.info()->padding().empty() && lhs.info()->padding().empty() && bias.info()->padding().bottom == 0 && bias.info()->padding().top == 0;
+}
 } // namespace
 
 TEST_SUITE(CL)
@@ -190,42 +293,122 @@
  *     - Unsupported bias addition: bias broadcast mode is 0 if the input or output has to be reinterpreted as 3D
  *     - Incorrect bias diemension when bias broadcast mode is 1 and beta is not 0.0f, should be (n, 1), not (n, m)
  *     - Incorrect input0 dimension when input is reinterpreted as 3D: input0->dimension(1) * input0->dimension(2) != m
+ *     - Correct support for creating an OpenCL image object from buffer
+ *     - Incorrect support for creating an OpenCL image object from buffer. N0 is 2 but it can only be 4,8 and 16
+ *     - Incorrect support for creating an OpenCL image object from buffer. Data type is F16 but it can only be F32
  */
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(
-framework::dataset::make("batch_size",          { 1, 1, 1, 1, 1, 1, 2 }),
-framework::dataset::make("M0",                  { 4, 9, 4, 4, 4, 4, 4 })),
-framework::dataset::make("N0",                  { 4, 4, 18, 4, 4, 4, 4 })),
-framework::dataset::make("K0",                  { 4, 4, 4, 1, 4, 4, 4 })),
-framework::dataset::make("broadcast_bias",      { false, false, false, false, false, true, true })),
-framework::dataset::make("input_as_3d",         { 0, 0, 0, 0, 1, 0, 1 })),
-framework::dataset::make("depth_output_gemm3d", { 0, 0, 0, 0, 0, 1, 0 })),
-framework::dataset::make("data_type_input0",    { DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32})),
-framework::dataset::make("data_type_input1",    { DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32})),
-framework::dataset::make("data_type_input2",    { DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32})),
-framework::dataset::make("data_type_output",    { DataType::F16, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32})),
-framework::dataset::make("Beta",                { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f })),
-framework::dataset::make("Expected",            { false, false, false, false, false, false, false })),
-b_value, m0_value, n0_value, k0_value, broadcast_bias, input_as_3d, depth_output_gemm3d, dt_input0, dt_intpu1, dt_input2, dt_output, beta, expected)
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(zip(
+framework::dataset::make("batch_size",          { 1, 1, 1, 1, 1, 1, 2, 1, 1, 1 }),
+framework::dataset::make("M0",                  { 4, 9, 4, 4, 4, 4, 4, 4, 4, 4 })),
+framework::dataset::make("N0",                  { 4, 4, 18, 4, 4, 4, 4, 8, 2, 8 })),
+framework::dataset::make("K0",                  { 4, 4, 4, 1, 4, 4, 4, 4, 4, 4 })),
+framework::dataset::make("broadcast_bias",      { false, false, false, false, false, true, true, false, false, false })),
+framework::dataset::make("input_as_3d",         { 0, 0, 0, 0, 1, 0, 1, 0, 0, 0 })),
+framework::dataset::make("depth_output_gemm3d", { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0 })),
+framework::dataset::make("export_to_cl_image",  { false, false, false, false, false, false, false, true, true, true })),
+framework::dataset::make("data_type_input0",    { DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F16})),
+framework::dataset::make("data_type_input1",    { DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F16})),
+framework::dataset::make("data_type_input2",    { DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F16})),
+framework::dataset::make("data_type_output",    { DataType::F16, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F32, DataType::F16})),
+framework::dataset::make("Beta",                { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 1.0f, 0.0f , 1.0f})),
+framework::dataset::make("Expected",            { false, false, false, false, false, false, false, true, false, false })),
+b_value, m0_value, n0_value, k0_value, broadcast_bias, input_as_3d, depth_output_gemm3d, export_to_cl_image, dt_input0, dt_intpu1, dt_input2, dt_output, beta, expected)
 {
-    bool status = validate_configuration(37, 51, 23, b_value, m0_value, n0_value, k0_value, 1, false, false, broadcast_bias, input_as_3d, depth_output_gemm3d, ActivationLayerInfo(), dt_input0, dt_intpu1, dt_input2, dt_output, 1.0f, beta);
-    ARM_COMPUTE_EXPECT(status == expected, framework::LogLevel::ERRORS);
+    bool expected_value = expected;
+
+    // Change expected to false if the target platform does not support the OpenCL cl_khr_image2d_from_buffer extension
+    if(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()) && export_to_cl_image)
+    {
+        expected_value = false;
+    }
+
+    bool status = validate_configuration(37, 51, 23, b_value, m0_value, n0_value, k0_value, 1, false, false, export_to_cl_image, broadcast_bias, input_as_3d, depth_output_gemm3d, ActivationLayerInfo(), dt_input0, dt_intpu1, dt_input2, dt_output, 1.0f, beta);
+    ARM_COMPUTE_EXPECT(status == expected_value, framework::LogLevel::ERRORS);
+}
+
+/** Validate zero padding tests
+ *
+ * A series of validation tests to check that no padding is added as part of configuration for 4 different scenarios.
+ *
+ * Checks performed in order:
+ *     - No partial blocks in both x and y dimensions
+ *     - Partial blocks in x dimension
+ *     - Partial blocks in y dimension
+ *     - Partial blocks in both x and y dimensions
+ *     - Special case: partial_n0 == 9 (vstore1 should be invoked instead of vstore_partial_1)
+ */
+DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(zip(zip(zip(
+framework::dataset::make("M",                   { 24, 64, 101,   1, 100 }),
+framework::dataset::make("N",                   { 48, 29,  16, 122,  41 })),
+framework::dataset::make("M0",                  {  4,  8,   7,   2,   1 })),
+framework::dataset::make("N0",                  {  4,  4,  16,   3,  16 })),
+framework::dataset::make("export_to_cl_image",  { false, true, true, false, false })),
+m_value, n_value, m0_value, n0_value, export_to_cl_image)
+{
+    constexpr DataType dt = DataType::F32;
+    // Disable export_to_cl_image if the target platform does not support the OpenCL cl_khr_image2d_from_buffer extension
+    bool actual_export_to_cl_image = image2d_from_buffer_supported(CLKernelLibrary::get().get_device()) && export_to_cl_image;
+
+    bool status = validate_zero_padding(m_value, n_value, 23, 1, m0_value, n0_value, 4, 1, false, false, actual_export_to_cl_image, false, 0, 0, ActivationLayerInfo(), dt, dt, dt, dt, 1.0f, 1.0f);
+    ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
 }
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
 
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture<float>, framework::DatasetMode::ALL,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+FIXTURE_DATA_TEST_CASE(RunPrecommitBoundaryHandlingPartialInXPartialInY, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture<float>, framework::DatasetMode::PRECOMMIT,
+                combine(combine(
+                        framework::dataset::make("M", 3),
+                        framework::dataset::make("N", 1)),
+                        boundary_handling_cases))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunPrecommitBoundaryHandlingPartialInXFullInY, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture<float>, framework::DatasetMode::PRECOMMIT,
+                combine(combine(
+                        framework::dataset::make("M", 64),
+                        framework::dataset::make("N", 43)),
+                        boundary_handling_cases))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunPrecommitBoundaryHandlingFullInXFullInY, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture<float>, framework::DatasetMode::PRECOMMIT,
+                combine(combine(
+                        framework::dataset::make("M", 64),
+                        framework::dataset::make("N", 32)),
+                        boundary_handling_cases))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunPrecommitBoundaryHandlingFullInXPartialInY, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture<float>, framework::DatasetMode::PRECOMMIT,
+                combine(combine(
+                        framework::dataset::make("M", 37),
+                        framework::dataset::make("N", 32)),
+                        boundary_handling_cases))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunPrecommit, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture<float>, framework::DatasetMode::PRECOMMIT,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
                                                                    m_values,
                                                                    n_values),
                                                                    k_values),
                                                                    b_values),
-                                                                   m0_values),
-                                                                   n0_values),
-                                                                   k0_values),
+                                                                   m0_values_precommit),
+                                                                   n0_values_precommit),
+                                                                   k0_values_precommit),
                                                                    h0_values),
                                                                    i_values_rhs),
                                                                    t_values_rhs),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", false)),
                                                                    framework::dataset::make("DataType", DataType::F32)),
                                                                    a_values),
                                                                    beta_values),
@@ -236,19 +419,43 @@
     validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
 }
 
-FIXTURE_DATA_TEST_CASE(RunSmall3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixture<float>, framework::DatasetMode::ALL,
-                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+FIXTURE_DATA_TEST_CASE(RunNightly, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture<float>, framework::DatasetMode::NIGHTLY,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_values,
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0_values_nightly),
+                                                                   n0_values_nightly),
+                                                                   k0_values_nightly),
+                                                                   h0_values),
+                                                                   i_values_rhs),
+                                                                   t_values_rhs),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", false)),
+                                                                   framework::dataset::make("DataType", DataType::F32)),
+                                                                   a_values),
+                                                                   beta_values),
+                                                                   broadcast_bias_values),
+                                                                   act_values))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunPrecommit3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixture<float>, framework::DatasetMode::PRECOMMIT,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
                                                                    m_w_values,
                                                                    m_h_values),
                                                                    n_values),
                                                                    k_values),
                                                                    b_values),
-                                                                   m0_values),
-                                                                   n0_values),
-                                                                   k0_values),
+                                                                   m0_values_precommit),
+                                                                   n0_values_precommit),
+                                                                   k0_values_precommit),
                                                                    h0_values),
                                                                    i_values_rhs),
                                                                    t_values_rhs),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", false)),
                                                                    framework::dataset::make("DataType", DataType::F32)),
                                                                    a_values),
                                                                    beta_values),
@@ -258,7 +465,235 @@
     validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
 }
 
+FIXTURE_DATA_TEST_CASE(RunNightly3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixture<float>, framework::DatasetMode::NIGHTLY,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_w_values,
+                                                                   m_h_values),
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0_values_nightly),
+                                                                   n0_values_nightly),
+                                                                   k0_values_nightly),
+                                                                   h0_values),
+                                                                   i_values_rhs),
+                                                                   t_values_rhs),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", false)),
+                                                                   framework::dataset::make("DataType", DataType::F32)),
+                                                                   a_values),
+                                                                   beta_values),
+                                                                   act_values))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+TEST_SUITE(ExportToCLImage)
+FIXTURE_DATA_TEST_CASE(RunPrecommit, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture<float>, framework::DatasetMode::PRECOMMIT,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_values,
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0_values_precommit),
+                                                                   n0_values_precommit),
+                                                                   k0_values_precommit),
+                                                                   h0_values),
+                                                                   i_values_rhs),
+                                                                   t_values_rhs),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", true)),
+                                                                   framework::dataset::make("DataType", DataType::F32)),
+                                                                   a_values),
+                                                                   beta_values),
+                                                                   broadcast_bias_values),
+                                                                   act_values))
+{
+    // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension
+    if(image2d_from_buffer_supported(CLKernelLibrary::get().get_device()))
+    {
+        validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+
+FIXTURE_DATA_TEST_CASE(RunNightly, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture<float>, framework::DatasetMode::NIGHTLY,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_values,
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0_values_nightly),
+                                                                   n0_values_nightly),
+                                                                   k0_values_nightly),
+                                                                   h0_values),
+                                                                   i_values_rhs),
+                                                                   t_values_rhs),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", true)),
+                                                                   framework::dataset::make("DataType", DataType::F32)),
+                                                                   a_values),
+                                                                   beta_values),
+                                                                   broadcast_bias_values),
+                                                                   act_values))
+{
+    // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension
+    if(image2d_from_buffer_supported(CLKernelLibrary::get().get_device()))
+    {
+        validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+    }
+    else
+    {
+        ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped");
+        framework::ARM_COMPUTE_PRINT_INFO();
+    }
+}
+
+FIXTURE_DATA_TEST_CASE(RunPrecommit3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixture<float>, framework::DatasetMode::PRECOMMIT,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_w_values,
+                                                                   m_h_values),
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0_values_precommit),
+                                                                   n0_values_precommit),
+                                                                   k0_values_precommit),
+                                                                   h0_values),
+                                                                   i_values_rhs),
+                                                                   t_values_rhs),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", true)),
+                                                                   framework::dataset::make("DataType", DataType::F32)),
+                                                                   a_values),
+                                                                   beta_values),
+                                                                   act_values))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunNightly3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixture<float>, framework::DatasetMode::NIGHTLY,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_w_values,
+                                                                   m_h_values),
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0_values_nightly),
+                                                                   n0_values_nightly),
+                                                                   k0_values_nightly),
+                                                                   h0_values),
+                                                                   i_values_rhs),
+                                                                   t_values_rhs),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", true)),
+                                                                   framework::dataset::make("DataType", DataType::F32)),
+                                                                   a_values),
+                                                                   beta_values),
+                                                                   act_values))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+}
+TEST_SUITE_END() // ExportToCLImage
 TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunPrecommit, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture<half>, framework::DatasetMode::PRECOMMIT,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_values,
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0_values_precommit),
+                                                                   n0_values_precommit),
+                                                                   k0_values_precommit),
+                                                                   h0_values),
+                                                                   i_values_rhs),
+                                                                   t_values_rhs),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", false)),
+                                                                   framework::dataset::make("DataType", DataType::F16)),
+                                                                   a_values),
+                                                                   beta_values),
+                                                                   broadcast_bias_values),
+                                                                   act_values))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunNightly, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture<half>, framework::DatasetMode::NIGHTLY,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_values,
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0_values_nightly),
+                                                                   n0_values_nightly),
+                                                                   k0_values_nightly),
+                                                                   h0_values),
+                                                                   i_values_rhs),
+                                                                   t_values_rhs),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", false)),
+                                                                   framework::dataset::make("DataType", DataType::F16)),
+                                                                   a_values),
+                                                                   beta_values),
+                                                                   broadcast_bias_values),
+                                                                   act_values))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunPrecommit3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixture<half>, framework::DatasetMode::PRECOMMIT,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_w_values,
+                                                                   m_h_values),
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0_values_precommit),
+                                                                   n0_values_precommit),
+                                                                   k0_values_precommit),
+                                                                   h0_values),
+                                                                   i_values_rhs),
+                                                                   t_values_rhs),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", false)),
+                                                                   framework::dataset::make("DataType", DataType::F16)),
+                                                                   a_values),
+                                                                   beta_values),
+                                                                   act_values))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunNightly3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixture<half>, framework::DatasetMode::NIGHTLY,
+                combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                                                   m_w_values,
+                                                                   m_h_values),
+                                                                   n_values),
+                                                                   k_values),
+                                                                   b_values),
+                                                                   m0_values_nightly),
+                                                                   n0_values_nightly),
+                                                                   k0_values_nightly),
+                                                                   h0_values),
+                                                                   i_values_rhs),
+                                                                   t_values_rhs),
+                                                                   framework::dataset::make("export_to_cl_image_rhs", false)),
+                                                                   framework::dataset::make("DataType", DataType::F16)),
+                                                                   a_values),
+                                                                   beta_values),
+                                                                   act_values))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16);
+}
+
+TEST_SUITE_END() // FP16
+
 TEST_SUITE_END() // Float
 TEST_SUITE_END() // GEMMMatrixMulipltyReshapedOnlyRHS
 TEST_SUITE_END() // CL
diff --git a/tests/validation/CL/GEMMReshapeLHSMatrix.cpp b/tests/validation/CL/GEMMReshapeLHSMatrix.cpp
index d252f87..d9439f6 100644
--- a/tests/validation/CL/GEMMReshapeLHSMatrix.cpp
+++ b/tests/validation/CL/GEMMReshapeLHSMatrix.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -81,10 +81,69 @@
 
 /** Transpose values to test */
 const auto t_values = framework::dataset::make("transpose", { true, false });
+
+/** Zero padding test */
+bool validate_zero_padding(unsigned int m_value, unsigned int k_value, unsigned int b_value, unsigned int m0_value, unsigned int k0_value, unsigned int v0_value,
+                            bool i_value_lhs, bool t_value_lhs, bool input_as_3d, DataType dt)
+{
+    const unsigned int M = m_value;
+    const unsigned int K = k_value;
+    const unsigned int B = b_value;
+
+    GEMMLHSMatrixInfo lhs_info;
+    lhs_info.m0 = m0_value;
+    lhs_info.k0 = k0_value;
+    lhs_info.v0 = v0_value;
+    lhs_info.interleave = i_value_lhs;
+    lhs_info.transpose = t_value_lhs;
+
+    const TensorShape lhs_shape(K, M, B);
+    const TensorShape lhs_shape_reshaped = compute_lhs_reshaped_shape(TensorInfo(lhs_shape, 1, dt), lhs_info, input_as_3d);
+
+    // Create tensors
+    CLTensor lhs = create_tensor<CLTensor>(lhs_shape, dt);
+    CLTensor dst = create_tensor<CLTensor>(lhs_shape_reshaped, dt);
+
+    ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Validate zero-padding
+    CLGEMMReshapeLHSMatrixKernel lhs_reshape;
+
+    lhs_reshape.configure(&lhs, &dst, lhs_info, input_as_3d);
+
+    return lhs.info()->padding().empty();
+}
 } // namespace
 
 TEST_SUITE(CL)
 TEST_SUITE(GEMMReshapeLHSMatrix)
+
+/** Validate zero padding tests for the LHS input tensor
+ *
+ * A series of validation tests to test the zero padding requirement
+ *
+ * Checks performed in order:
+ *     - Case where M and K are smaller than M0 and K0
+ *     - Generic test case with batch size = 1
+ *     - Generic test case with batch size = 4
+ *     - Generic test case with input_as_3d_value = true
+ */
+DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(
+framework::dataset::make("M",                   { 1, 23, 63, 101 }),
+framework::dataset::make("K",                   { 1, 47, 29,  27 })),
+framework::dataset::make("B",                   { 1, 1, 4, 7 })),
+framework::dataset::make("M0",                  { 4, 2, 4, 8 })),
+framework::dataset::make("K0",                  { 2, 2, 4, 8 })),
+framework::dataset::make("input_as_3d",         { false, false, false, true })),
+m_value, k_value, b_value, m0_value, k0_value, input_as_3d_value)
+{
+    constexpr DataType dt = DataType::F32;
+
+    bool status = validate_zero_padding(m_value, k_value, b_value, m0_value, k0_value, 2, false, false, input_as_3d_value, dt);
+    ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
+}
+
 FIXTURE_DATA_TEST_CASE(S32, CLGEMMReshapeLHSMatrixFixture<int>, framework::DatasetMode::ALL,
                 combine(combine(combine(combine(combine(combine(combine(datasets::SmallGEMMReshape2DShapes(),
                                                                    b_values),
@@ -172,4 +231,4 @@
 TEST_SUITE_END() // CL
 } // namespace validation
 } // namespace test
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/tests/validation/CL/GEMMReshapeRHSMatrix.cpp b/tests/validation/CL/GEMMReshapeRHSMatrix.cpp
index 55688cf..c7b0752 100644
--- a/tests/validation/CL/GEMMReshapeRHSMatrix.cpp
+++ b/tests/validation/CL/GEMMReshapeRHSMatrix.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -123,6 +123,49 @@
     bool has_error = bool(CLGEMMReshapeRHSMatrixKernel::validate(&input_info.clone()->set_is_resizable(false), (output_info.total_size() == 0) ? nullptr : &output_info.clone()->set_is_resizable(false), rhs_info));
     ARM_COMPUTE_EXPECT(has_error == expected, framework::LogLevel::ERRORS);
 }
+
+DATA_TEST_CASE(ValidatePadding, framework::DatasetMode::ALL, combine(combine(combine(
+               framework::dataset::make("InputShape", { TensorShape(32U, 16U, 1U),
+                                                        TensorShape(32U, 16U, 2U)
+                                                     }),
+                framework::dataset::make("N0",{ 4 })),
+                framework::dataset::make("K0",{ 4, 8, 16 })),
+                framework::dataset::make("H0",{ 1, 2, 4 })),
+               input_shape, n0, k0, h0)
+{
+    CLTensor input;
+    CLTensor output;
+
+    input.info()->init(input_shape, 1, DataType::F32);
+
+    unsigned int padding = 0;
+
+    GEMMRHSMatrixInfo rhs_info;
+    rhs_info.n0 = n0;
+    rhs_info.k0 = k0;
+    rhs_info.h0 = h0;
+    rhs_info.transpose = true;
+    rhs_info.interleave = true;
+    rhs_info.export_to_cl_image = image2d_from_buffer_supported(CLKernelLibrary::get().get_device()) && (get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) != 0);
+
+    if(rhs_info.export_to_cl_image)
+    {
+        TensorShape output_shape = compute_rhs_reshaped_shape(*input.info(), rhs_info);
+        constexpr unsigned int num_floats_per_pixel = 4;
+
+                        const unsigned int pixel_alignment     = get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device());
+                        const unsigned int row_pitch_alignment = pixel_alignment * num_floats_per_pixel;
+        const unsigned int round_up_width      = ((output_shape[0] + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment;
+
+        padding = round_up_width - output_shape[0];
+    }
+
+    CLGEMMReshapeRHSMatrixKernel kernel;
+
+    kernel.configure(&input, &output, rhs_info);
+
+    ARM_COMPUTE_EXPECT((output.info()->padding().right == padding), framework::LogLevel::ERRORS);
+}
 // clang-format on
 // *INDENT-ON*
 
diff --git a/tests/validation/CL/Gather.cpp b/tests/validation/CL/Gather.cpp
index cc892a3..e7f860e 100644
--- a/tests/validation/CL/Gather.cpp
+++ b/tests/validation/CL/Gather.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Gaussian3x3.cpp b/tests/validation/CL/Gaussian3x3.cpp
index b7d0d74..10b1a47 100644
--- a/tests/validation/CL/Gaussian3x3.cpp
+++ b/tests/validation/CL/Gaussian3x3.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Gaussian5x5.cpp b/tests/validation/CL/Gaussian5x5.cpp
index 3eca86e..a33ac4d 100644
--- a/tests/validation/CL/Gaussian5x5.cpp
+++ b/tests/validation/CL/Gaussian5x5.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/GaussianPyramid.cpp b/tests/validation/CL/GaussianPyramid.cpp
index 2a4596d..4c17cdc 100644
--- a/tests/validation/CL/GaussianPyramid.cpp
+++ b/tests/validation/CL/GaussianPyramid.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/GenerateProposalsLayer.cpp b/tests/validation/CL/GenerateProposalsLayer.cpp
index c5ce710..d0ceef5 100644
--- a/tests/validation/CL/GenerateProposalsLayer.cpp
+++ b/tests/validation/CL/GenerateProposalsLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/GlobalPoolingLayer.cpp b/tests/validation/CL/GlobalPoolingLayer.cpp
index bd4fb68..5328fc8 100644
--- a/tests/validation/CL/GlobalPoolingLayer.cpp
+++ b/tests/validation/CL/GlobalPoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/HOGDescriptor.cpp b/tests/validation/CL/HOGDescriptor.cpp
index f5a00ba..7c014b5 100644
--- a/tests/validation/CL/HOGDescriptor.cpp
+++ b/tests/validation/CL/HOGDescriptor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/HOGDetector.cpp b/tests/validation/CL/HOGDetector.cpp
index 6c2c18c..78edf0f 100644
--- a/tests/validation/CL/HOGDetector.cpp
+++ b/tests/validation/CL/HOGDetector.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/HOGMultiDetection.cpp b/tests/validation/CL/HOGMultiDetection.cpp
index 634af41..091ff9e 100644
--- a/tests/validation/CL/HOGMultiDetection.cpp
+++ b/tests/validation/CL/HOGMultiDetection.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/HarrisCorners.cpp b/tests/validation/CL/HarrisCorners.cpp
index 8407dbd..51591bb 100644
--- a/tests/validation/CL/HarrisCorners.cpp
+++ b/tests/validation/CL/HarrisCorners.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/HeightConcatenateLayer.cpp b/tests/validation/CL/HeightConcatenateLayer.cpp
index 78182ce..0f21c27 100644
--- a/tests/validation/CL/HeightConcatenateLayer.cpp
+++ b/tests/validation/CL/HeightConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Histogram.cpp b/tests/validation/CL/Histogram.cpp
index 318bc1e..643e4f5 100644
--- a/tests/validation/CL/Histogram.cpp
+++ b/tests/validation/CL/Histogram.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Im2Col.cpp b/tests/validation/CL/Im2Col.cpp
index 7ccd0c3..12b082f 100644
--- a/tests/validation/CL/Im2Col.cpp
+++ b/tests/validation/CL/Im2Col.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -122,6 +122,16 @@
         const auto status    = CLIm2ColKernel::validate(&input, &output, conv_size, PadStrideInfo(), has_bias);
         ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
     }
+
+    // Kernel dimensions are too big
+    {
+        const auto input     = TensorInfo(TensorShape(1U, 9U, 5U, 2U), 1, DataType::F32, DataLayout::NHWC);
+        const auto output    = TensorInfo(TensorShape(1U, 1U, 1U, 2U), 1, DataType::F32, DataLayout::NHWC);
+        const auto conv_size = Size2D(9, 9);
+        const bool has_bias  = false;
+        const auto status    = CLIm2ColKernel::validate(&input, &output, conv_size, PadStrideInfo(), has_bias);
+        ARM_COMPUTE_EXPECT(bool(status) == false, framework::LogLevel::ERRORS);
+    }
 }
 
 template <typename T>
@@ -129,6 +139,45 @@
 
 TEST_SUITE(NHWC)
 
+/** Test that there's no padding added to input or output as part of configure
+ *
+ * @note 2 elements processed per iteration
+ *
+ * Three tests will be run:
+ *  - Channels are multiple of elements processed
+ *  - Channels larger and non multiple of elements used
+ *  - Channels smaller and not multiple of elements used
+ *
+ */
+DATA_TEST_CASE(ValidateZeroPaddingNumElemsPerIterEqual2, framework::DatasetMode::ALL,
+               combine(combine(combine(combine(combine(
+                                                   framework::dataset::make("InputChannel",
+{
+    2, 9, 1,
+}),
+framework::dataset::make("DataType", { DataType::F32 })),
+framework::dataset::make("Kernel", { Size2D(3, 4) })),
+framework::dataset::make("PadStride", { PadStrideInfo(2, 1, 1, 2) })),
+framework::dataset::make("QInfo", { QuantizationInfo() })),
+framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+input_channel, data_type, conv_size, pad_stride_info, qinfo, data_layout)
+{
+    TensorShape input_shape(input_channel, 10U, 30U, 3U);
+    const bool  has_bias = false;
+
+    const auto input_info   = TensorInfo(input_shape, 1, data_type, data_layout);
+    const auto output_shape = compute_im2col_conv_shape(&input_info, conv_size, pad_stride_info, has_bias, Size2D(1U, 1U), true);
+
+    CLTensor input  = create_tensor<CLTensor>(input_shape, data_type, 1, qinfo, data_layout);
+    CLTensor output = create_tensor<CLTensor>(output_shape, data_type, 1, qinfo, data_layout);
+
+    CLIm2ColKernel im2col;
+    im2col.configure(&input, &output, conv_size, pad_stride_info, has_bias);
+
+    // Ensure that no padding was added to either tensor
+    const bool no_padding = input.info()->padding().empty() && output.info()->padding().empty();
+    ARM_COMPUTE_EXPECT(no_padding, framework::LogLevel::ERRORS);
+}
 /** Test special kernel used for NHWC for 3x3 kernels
  *
  * @note 2 elements processed per iteration
@@ -150,7 +199,7 @@
 }),
 framework::dataset::make("DataType", DataType::F32)),
 framework::dataset::make("Kernel", Size2D(3, 3))),
-framework::dataset::make("PadStride", PadStrideInfo(1, 2, 1, 2))),
+framework::dataset::make("PadStride", { PadStrideInfo(1, 2, 1, 2), PadStrideInfo(1, 1, 0, 0) })),
 framework::dataset::make("QInfo", QuantizationInfo())),
 framework::dataset::make("DataLayout", DataLayout::NHWC)),
 framework::dataset::make("Groups", 1)))
@@ -176,11 +225,41 @@
                        combine(combine(combine(combine(combine(combine(
                                                                    framework::dataset::make("InputShape",
 {
-    TensorShape(2U, 13U, 15U, 2U), TensorShape(3U, 15U, 12U, 2U), TensorShape(1U, 1U, 2U, 2U),
+    TensorShape(2U, 13U, 15U, 2U), TensorShape(3U, 15U, 12U, 2U), TensorShape(1U, 13U, 22U, 2U),
 }),
 framework::dataset::make("DataType", DataType::F32)),
 framework::dataset::make("Kernel", Size2D(9, 9))),
-framework::dataset::make("PadStride", PadStrideInfo(2, 2, 1, 2))),
+framework::dataset::make("PadStride", { PadStrideInfo(2, 2, 1, 2), PadStrideInfo(1, 1, 0, 0) })),
+framework::dataset::make("QInfo", QuantizationInfo())),
+framework::dataset::make("DataLayout", DataLayout::NHWC)),
+framework::dataset::make("Groups", 1)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+
+/** Test generic kernel used for NHWC
+ *
+ * @note 2 elements processed per iteration
+ *
+ * Three tests will be run:
+ *  - Channels are multiple of elements processed
+ *  - Channels larger and non multiple of elements used
+ *  - Channels smaller and not multiple of elements used
+ *
+ *  Kernel tested: im2col_generic_nhwc
+ */
+FIXTURE_DATA_TEST_CASE(Generic,
+                       CLIm2ColFixture<float>,
+                       framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(
+                                                                   framework::dataset::make("InputShape",
+{
+    TensorShape(4U, 13U, 15U, 2U), TensorShape(7U, 15U, 12U, 1U), TensorShape(1U, 5U, 3U, 1U),
+}),
+framework::dataset::make("DataType", DataType::F32)),
+framework::dataset::make("Kernel", Size2D(5, 3))),
+framework::dataset::make("PadStride", { PadStrideInfo(2, 2, 1, 2), PadStrideInfo(1, 1, 0, 0) })),
 framework::dataset::make("QInfo", QuantizationInfo())),
 framework::dataset::make("DataLayout", DataLayout::NHWC)),
 framework::dataset::make("Groups", 1)))
@@ -331,7 +410,7 @@
                        CLIm2ColFixture<float>,
                        framework::DatasetMode::ALL,
                        combine(combine(combine(combine(combine(combine(
-                                                                   framework::dataset::make("InputShape", TensorShape(13U, 11U, 2U, 2U)),
+                                                                   framework::dataset::make("InputShape", TensorShape(13U, 11U, 5U, 2U)),
                                                                    framework::dataset::make("DataType", DataType::F32)),
                                                                framework::dataset::make("Kernel", { Size2D(3, 2), Size2D(3, 5) })),
                                                        framework::dataset::make("PadStride", PadStrideInfo(2, 1, 2, 1))),
@@ -357,7 +436,7 @@
                        CLIm2ColFixture<uint8_t>,
                        framework::DatasetMode::ALL,
                        combine(combine(combine(combine(combine(combine(
-                                                                   framework::dataset::make("InputShape", TensorShape(13U, 11U, 2U, 2U)),
+                                                                   framework::dataset::make("InputShape", TensorShape(13U, 11U, 11U, 2U)),
                                                                    framework::dataset::make("DataType", DataType::QASYMM8)),
                                                                framework::dataset::make("Kernel", { Size2D(1, 1), Size2D(3, 3), Size2D(5, 5), Size2D(3, 5), Size2D(9, 9) })),
                                                        framework::dataset::make("PadStride", { PadStrideInfo(1, 2, 1, 1) })),
@@ -383,7 +462,7 @@
                        CLIm2ColFixture<half>,
                        framework::DatasetMode::ALL,
                        combine(combine(combine(combine(combine(combine(
-                                                                   framework::dataset::make("InputShape", TensorShape(13U, 11U, 2U, 2U)),
+                                                                   framework::dataset::make("InputShape", TensorShape(13U, 11U, 11U, 2U)),
                                                                    framework::dataset::make("DataType", DataType::F16)),
                                                                framework::dataset::make("Kernel", { Size2D(1, 1), Size2D(3, 3), Size2D(5, 5), Size2D(3, 5), Size2D(9, 9) })),
                                                        framework::dataset::make("PadStride", { PadStrideInfo(1, 2, 1, 1) })),
diff --git a/tests/validation/CL/InstanceNormalizationLayer.cpp b/tests/validation/CL/InstanceNormalizationLayer.cpp
index 06de9e5..a30e326 100644
--- a/tests/validation/CL/InstanceNormalizationLayer.cpp
+++ b/tests/validation/CL/InstanceNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/IntegralImage.cpp b/tests/validation/CL/IntegralImage.cpp
index c8fe44d..74c5a4a 100644
--- a/tests/validation/CL/IntegralImage.cpp
+++ b/tests/validation/CL/IntegralImage.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/L2NormalizeLayer.cpp b/tests/validation/CL/L2NormalizeLayer.cpp
index beedd81..9502df5 100644
--- a/tests/validation/CL/L2NormalizeLayer.cpp
+++ b/tests/validation/CL/L2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/LSTMLayer.cpp b/tests/validation/CL/LSTMLayer.cpp
index 69ac61d..a550613 100644
--- a/tests/validation/CL/LSTMLayer.cpp
+++ b/tests/validation/CL/LSTMLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -134,9 +134,10 @@
                input_info, input_weights_info, recurrent_weights_info, cell_bias_info, projection_bias_info, cell_state_info, output_info, scratch_info, info, expected)
 {
     LSTMParams<ITensorInfo> lstm_params_info;
-    lstm_params_info.set_peephole_params(&cell_bias_info, &cell_bias_info)
+    auto cell_bias_clone = cell_bias_info.clone();
+    lstm_params_info.set_peephole_params(cell_bias_clone.get(), cell_bias_clone.get())
                     .set_projection_params(&recurrent_weights_info, &projection_bias_info)
-                    .set_cifg_params(&input_weights_info, &recurrent_weights_info, &cell_bias_info, &cell_bias_info);
+                    .set_cifg_params(&input_weights_info, &recurrent_weights_info, cell_bias_clone.get(), cell_bias_clone.get());
 
     ARM_COMPUTE_EXPECT(bool(CLLSTMLayer::validate(&input_info.clone()->set_is_resizable(false), &input_weights_info.clone()->set_is_resizable(false), &input_weights_info.clone()->set_is_resizable(false),
                                                   &input_weights_info.clone()->set_is_resizable(false), &recurrent_weights_info.clone()->set_is_resizable(false), &recurrent_weights_info.clone()->set_is_resizable(false),
diff --git a/tests/validation/CL/LSTMLayerQuantized.cpp b/tests/validation/CL/LSTMLayerQuantized.cpp
index 686d6bc..f975bfb 100644
--- a/tests/validation/CL/LSTMLayerQuantized.cpp
+++ b/tests/validation/CL/LSTMLayerQuantized.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/LaplacianPyramid.cpp b/tests/validation/CL/LaplacianPyramid.cpp
index 960dcee..1307f78 100644
--- a/tests/validation/CL/LaplacianPyramid.cpp
+++ b/tests/validation/CL/LaplacianPyramid.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/LaplacianReconstruct.cpp b/tests/validation/CL/LaplacianReconstruct.cpp
index 30079e7..c2e1fab 100644
--- a/tests/validation/CL/LaplacianReconstruct.cpp
+++ b/tests/validation/CL/LaplacianReconstruct.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/LocallyConnected.cpp b/tests/validation/CL/LocallyConnected.cpp
index d4fda68..d32487b 100644
--- a/tests/validation/CL/LocallyConnected.cpp
+++ b/tests/validation/CL/LocallyConnected.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/LogLayer.cpp b/tests/validation/CL/LogLayer.cpp
index 856c6c4..95c4f12 100644
--- a/tests/validation/CL/LogLayer.cpp
+++ b/tests/validation/CL/LogLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/LogSoftmaxLayer.cpp b/tests/validation/CL/LogSoftmaxLayer.cpp
index 148613c..15466af 100644
--- a/tests/validation/CL/LogSoftmaxLayer.cpp
+++ b/tests/validation/CL/LogSoftmaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,17 +46,6 @@
 /** Tolerance for float operations */
 RelativeTolerance<half>  tolerance_f16(half(0.2));
 RelativeTolerance<float> tolerance_f32(0.001f);
-
-/** Tolerance for quantized operations */
-constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);
-
-/** CNN data types */
-const auto CNNDataTypes = framework::dataset::make("DataType",
-{
-    DataType::QASYMM8,
-    DataType::F16,
-    DataType::F32,
-});
 } // namespace
 
 TEST_SUITE(CL)
@@ -70,7 +59,7 @@
 FIXTURE_DATA_TEST_CASE(RunSmall, CLLogSoftmaxLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
                                                                                                                       framework::dataset::make("DataType", DataType::F16)),
                                                                                                               framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                      framework::dataset::make("Axis", { 1, 2 })))
+                                                                                                      framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
@@ -78,7 +67,7 @@
 FIXTURE_DATA_TEST_CASE(RunLarge, CLLogSoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
                                                                                                                   framework::dataset::make("DataType", DataType::F16)),
                                                                                                                   framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                          framework::dataset::make("Axis", { 1, 2 })))
+                                                                                                          framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
@@ -86,18 +75,18 @@
 FIXTURE_DATA_TEST_CASE(Run4D, CLLogSoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayer4DShapes(),
                                                                                                                        framework::dataset::make("DataType", DataType::F16)),
                                                                                                                framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                       framework::dataset::make("Axis", { 1, 2, 3 })))
+                                                                                                       framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
-TEST_SUITE_END()
+TEST_SUITE_END() // FP16
 
 TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLLogSoftmaxLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
                                                                                                                        framework::dataset::make("DataType", DataType::F32)),
                                                                                                                framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                       framework::dataset::make("Axis", { 1, 2 })))
+                                                                                                       framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
@@ -105,7 +94,7 @@
 FIXTURE_DATA_TEST_CASE(RunLarge, CLLogSoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
                                                                                                                    framework::dataset::make("DataType", DataType::F32)),
                                                                                                                    framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                           framework::dataset::make("Axis", { 1, 2 })))
+                                                                                                           framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
@@ -113,52 +102,15 @@
 FIXTURE_DATA_TEST_CASE(Run4D, CLLogSoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayer4DShapes(),
                                                                                                                         framework::dataset::make("DataType", DataType::F32)),
                                                                                                                 framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                        framework::dataset::make("Axis", { 1, 2, 3 })))
+                                                                                                        framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
-TEST_SUITE_END()
-TEST_SUITE_END()
-
-template <typename T>
-using CLLogSoftmaxLayerQuantizedFixture = SoftmaxValidationQuantizedFixture<CLTensor, CLAccessor, CLLogSoftmaxLayer, T, true>;
-
-TEST_SUITE(Quantized)
-TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLLogSoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
-                                                                                                                  framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                                                                                  combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
-                                                                                                                          framework::dataset::make("Beta", { 1.0f, 2.f }))),
-                                                                                                                  framework::dataset::make("Axis", { 1, 2 })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, CLLogSoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
-                                                                                                                      framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                                                                                      combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
-                                                                                                                              framework::dataset::make("Beta", { 1.0f, 2.0f }))),
-                                                                                                                      framework::dataset::make("Axis", { 1 })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
-}
-FIXTURE_DATA_TEST_CASE(Run4D, CLLogSoftmaxLayerQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayer4DShapes(),
-                                                                                                                   framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                                                                                   combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
-                                                                                                                           framework::dataset::make("Beta", { 1.0f, 2.0f }))),
-                                                                                                                   framework::dataset::make("Axis", { 1, 2, 3 })))
-{
-    // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
-}
-
-TEST_SUITE_END()
-TEST_SUITE_END()
-
-TEST_SUITE_END()
-TEST_SUITE_END()
+TEST_SUITE_END() // FP32
+TEST_SUITE_END() // Float
+TEST_SUITE_END() // LogSoftmaxLayer
+TEST_SUITE_END() // CL
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/CL/Magnitude.cpp b/tests/validation/CL/Magnitude.cpp
index 15f807d..82bce34 100644
--- a/tests/validation/CL/Magnitude.cpp
+++ b/tests/validation/CL/Magnitude.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/MaxUnpoolingLayer.cpp b/tests/validation/CL/MaxUnpoolingLayer.cpp
new file mode 100644
index 0000000..6cba8b8
--- /dev/null
+++ b/tests/validation/CL/MaxUnpoolingLayer.cpp
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h"
+#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/MaxUnpoolingLayerFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(MaxUnpoolingLayer)
+
+template <typename T>
+using CLMaxUnpoolingLayerFixture = MaxUnpoolingLayerValidationFixture<CLTensor, CLAccessor, CLPoolingLayer, CLMaxUnpoolingLayer, T>;
+
+const auto PoolingLayerIndicesDatasetFPSmall = combine(combine(framework::dataset::make("PoolType", { PoolingType::MAX }), framework::dataset::make("PoolingSize", { Size2D(2, 2) })),
+                                                       framework::dataset::make("PadStride", { PadStrideInfo(2, 2, 0, 0), PadStrideInfo(2, 1, 0, 0) }));
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(MaxUnpooling, CLMaxUnpoolingLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerIndicesDatasetFPSmall,
+                                                                                                                   framework::dataset::make("DataType", DataType::F32))),
+                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })
+
+                                                                                                                  ))
+{
+
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // FP32
+
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(MaxUnpooling, CLMaxUnpoolingLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerIndicesDatasetFPSmall,
+                                                                                                                  framework::dataset::make("DataType", DataType::F16))),
+                                                                                                                  framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })
+
+                                                                                                                 ))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END() // FP16
+
+TEST_SUITE_END() // Float
+TEST_SUITE_END() // MaxUnpoolingLayer
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/CL/MeanStdDev.cpp b/tests/validation/CL/MeanStdDev.cpp
index f1f2326..d69d8c2 100644
--- a/tests/validation/CL/MeanStdDev.cpp
+++ b/tests/validation/CL/MeanStdDev.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/MeanStdDevNormalizationLayer.cpp b/tests/validation/CL/MeanStdDevNormalizationLayer.cpp
index 714f100..a355f9e 100644
--- a/tests/validation/CL/MeanStdDevNormalizationLayer.cpp
+++ b/tests/validation/CL/MeanStdDevNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Median3x3.cpp b/tests/validation/CL/Median3x3.cpp
index 4c8b268..b61e7c0 100644
--- a/tests/validation/CL/Median3x3.cpp
+++ b/tests/validation/CL/Median3x3.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/MinMaxLocation.cpp b/tests/validation/CL/MinMaxLocation.cpp
index ed39525..e4a1718 100644
--- a/tests/validation/CL/MinMaxLocation.cpp
+++ b/tests/validation/CL/MinMaxLocation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/NegLayer.cpp b/tests/validation/CL/NegLayer.cpp
index 3dcb0ba..690b8f4 100644
--- a/tests/validation/CL/NegLayer.cpp
+++ b/tests/validation/CL/NegLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/NonLinearFilter.cpp b/tests/validation/CL/NonLinearFilter.cpp
index 536b458..325849b 100644
--- a/tests/validation/CL/NonLinearFilter.cpp
+++ b/tests/validation/CL/NonLinearFilter.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/NormalizationLayer.cpp b/tests/validation/CL/NormalizationLayer.cpp
index fdfb225..8894980 100644
--- a/tests/validation/CL/NormalizationLayer.cpp
+++ b/tests/validation/CL/NormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/NormalizePlanarYUVLayer.cpp b/tests/validation/CL/NormalizePlanarYUVLayer.cpp
index 54fff01..58c3b82 100644
--- a/tests/validation/CL/NormalizePlanarYUVLayer.cpp
+++ b/tests/validation/CL/NormalizePlanarYUVLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/OpticalFlow.cpp b/tests/validation/CL/OpticalFlow.cpp
index 006d40a..cf60038 100644
--- a/tests/validation/CL/OpticalFlow.cpp
+++ b/tests/validation/CL/OpticalFlow.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/PReluLayer.cpp b/tests/validation/CL/PReluLayer.cpp
index ce678d9..832bac2 100644
--- a/tests/validation/CL/PReluLayer.cpp
+++ b/tests/validation/CL/PReluLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/PadLayer.cpp b/tests/validation/CL/PadLayer.cpp
index 631cc70..370195b 100644
--- a/tests/validation/CL/PadLayer.cpp
+++ b/tests/validation/CL/PadLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Permute.cpp b/tests/validation/CL/Permute.cpp
index 8eb302a..ed5d18b 100644
--- a/tests/validation/CL/Permute.cpp
+++ b/tests/validation/CL/Permute.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Phase.cpp b/tests/validation/CL/Phase.cpp
index 2b750ce..71ac669 100644
--- a/tests/validation/CL/Phase.cpp
+++ b/tests/validation/CL/Phase.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/PixelWiseMultiplication.cpp b/tests/validation/CL/PixelWiseMultiplication.cpp
index ea686af..70e618e 100644
--- a/tests/validation/CL/PixelWiseMultiplication.cpp
+++ b/tests/validation/CL/PixelWiseMultiplication.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,6 +50,10 @@
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.75f, 0.25f),
     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 0.75f, 0.25f)
 });
+// Since in-place computation on CL-side hasn't been intended to be implemented, they are not tested.
+// However, this dataset is required for the shared fixture and it would make extension easier when
+// CL-side also starts supporting in-place computation.
+const auto InPlaceDataSet = framework::dataset::make("InPlace", { false });
 } //namespace
 // *INDENT-OFF*
 // clang-format off
@@ -57,13 +61,14 @@
 
 #define PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(TEST_NAME, FIXTURE, MODE, SHAPES, DT1, DT2, SCALE, RP, ACT, VALIDATE) \
     FIXTURE_DATA_TEST_CASE(TEST_NAME, CLPixelWiseMultiplication##FIXTURE, framework::DatasetMode::MODE,                   \
-                           combine(combine(combine(combine(combine(combine(                                                       \
+                           combine(combine(combine(combine(combine(combine(combine(                                                       \
                            datasets::SHAPES,                                                                              \
                            framework::dataset::make("DataType1", DataType::DT1)),                                         \
                            framework::dataset::make("DataType2", DataType::DT2)),                                         \
                            framework::dataset::make("Scale", std::move(SCALE))),                                          \
                            datasets::ConvertPolicies()),                                                                  \
-                           framework::dataset::make("RoundingPolicy", RoundingPolicy::RP)), ACT))  \
+                           framework::dataset::make("RoundingPolicy", RoundingPolicy::RP)), ACT), \
+                           InPlaceDataSet))  \
     {                                                                                                                     \
         VALIDATE                                                                                                          \
     }
@@ -143,16 +148,17 @@
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLPixelWiseMultiplicationQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                               framework::dataset::make("DataTypeIn1", DataType::QASYMM8)),
-                                                                                       framework::dataset::make("DataTypeIn2", DataType::QASYMM8)),
-                                                                               framework::dataset::make("DataTypeOut", DataType::QASYMM8)),
-                                                                       framework::dataset::make("Scale", { 1.f, 2.f })),
-                                                               framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
-                                                       framework::dataset::make("RoundingPolicy", RoundingPolicy::TO_NEAREST_EVEN)),
-                                               framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 20) })),
-                                       framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                               framework::dataset::make("OUtQInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+                       combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                       framework::dataset::make("DataTypeIn1", DataType::QASYMM8)),
+                                                                                               framework::dataset::make("DataTypeIn2", DataType::QASYMM8)),
+                                                                                       framework::dataset::make("DataTypeOut", DataType::QASYMM8)),
+                                                                               framework::dataset::make("Scale", { 1.f, 2.f })),
+                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
+                                                               framework::dataset::make("RoundingPolicy", RoundingPolicy::TO_NEAREST_EVEN)),
+                                                       framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 20) })),
+                                               framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                       framework::dataset::make("OUtQInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                               InPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
@@ -161,16 +167,17 @@
 
 TEST_SUITE(QASYMM8_SIGNED)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLPixelWiseMultiplicationQuantizedFixture<int8_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                               framework::dataset::make("DataTypeIn1", DataType::QASYMM8_SIGNED)),
-                                                                                       framework::dataset::make("DataTypeIn2", DataType::QASYMM8_SIGNED)),
-                                                                               framework::dataset::make("DataTypeOut", DataType::QASYMM8_SIGNED)),
-                                                                       framework::dataset::make("Scale", { 1.f, 2.f })),
-                                                               framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
-                                                       framework::dataset::make("RoundingPolicy", RoundingPolicy::TO_NEAREST_EVEN)),
-                                               framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 20) })),
-                                       framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 255.f, 10) })),
-                               framework::dataset::make("OUtQInfo", { QuantizationInfo(1.f / 255.f, 5) })))
+                       combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                       framework::dataset::make("DataTypeIn1", DataType::QASYMM8_SIGNED)),
+                                                                                               framework::dataset::make("DataTypeIn2", DataType::QASYMM8_SIGNED)),
+                                                                                       framework::dataset::make("DataTypeOut", DataType::QASYMM8_SIGNED)),
+                                                                               framework::dataset::make("Scale", { 1.f, 2.f })),
+                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
+                                                               framework::dataset::make("RoundingPolicy", RoundingPolicy::TO_NEAREST_EVEN)),
+                                                       framework::dataset::make("Src0QInfo", { QuantizationInfo(5.f / 255.f, 20) })),
+                                               framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                                       framework::dataset::make("OUtQInfo", { QuantizationInfo(1.f / 255.f, 5) })),
+                               InPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
@@ -179,31 +186,33 @@
 
 TEST_SUITE(QSYMM16)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLPixelWiseMultiplicationQuantizedFixture<int16_t>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                               framework::dataset::make("DataTypeIn1", DataType::QSYMM16)),
-                                                                                       framework::dataset::make("DataTypeIn2", DataType::QSYMM16)),
-                                                                               framework::dataset::make("DataTypeOut", DataType::QSYMM16)),
-                                                                       framework::dataset::make("Scale", { 1.f, 2.f })),
-                                                               framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
-                                                       framework::dataset::make("RoundingPolicy", RoundingPolicy::TO_NEAREST_EVEN)),
-                                               framework::dataset::make("Src0QInfo", { QuantizationInfo(1.f / 32768.f, 0) })),
-                                       framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 32768.f, 0) })),
-                               framework::dataset::make("OutQInfo", { QuantizationInfo(5.f / 32768.f, 0) })))
+                       combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                       framework::dataset::make("DataTypeIn1", DataType::QSYMM16)),
+                                                                                               framework::dataset::make("DataTypeIn2", DataType::QSYMM16)),
+                                                                                       framework::dataset::make("DataTypeOut", DataType::QSYMM16)),
+                                                                               framework::dataset::make("Scale", { 1.f, 2.f })),
+                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
+                                                               framework::dataset::make("RoundingPolicy", RoundingPolicy::TO_NEAREST_EVEN)),
+                                                       framework::dataset::make("Src0QInfo", { QuantizationInfo(1.f / 32768.f, 0) })),
+                                               framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 32768.f, 0) })),
+                                       framework::dataset::make("OutQInfo", { QuantizationInfo(5.f / 32768.f, 0) })),
+                               InPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qsymm16);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, CLPixelWiseMultiplicationQuantizedFixture<int16_t>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(combine(combine(combine(combine(combine(datasets::LargeShapes(),
-                                                                                               framework::dataset::make("DataTypeIn1", DataType::QSYMM16)),
-                                                                                       framework::dataset::make("DataTypeIn2", DataType::QSYMM16)),
-                                                                               framework::dataset::make("DataTypeOut", DataType::QSYMM16)),
-                                                                       framework::dataset::make("Scale", { 1.f, 2.f })),
-                                                               framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
-                                                       framework::dataset::make("RoundingPolicy", RoundingPolicy::TO_NEAREST_EVEN)),
-                                               framework::dataset::make("Src0QInfo", { QuantizationInfo(1.f / 32768.f, 0) })),
-                                       framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 32768.f, 0) })),
-                               framework::dataset::make("OutQInfo", { QuantizationInfo(5.f / 32768.f, 0) })))
+                       combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(datasets::LargeShapes(),
+                                                                                                       framework::dataset::make("DataTypeIn1", DataType::QSYMM16)),
+                                                                                               framework::dataset::make("DataTypeIn2", DataType::QSYMM16)),
+                                                                                       framework::dataset::make("DataTypeOut", DataType::QSYMM16)),
+                                                                               framework::dataset::make("Scale", { 1.f, 2.f })),
+                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
+                                                               framework::dataset::make("RoundingPolicy", RoundingPolicy::TO_NEAREST_EVEN)),
+                                                       framework::dataset::make("Src0QInfo", { QuantizationInfo(1.f / 32768.f, 0) })),
+                                               framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 32768.f, 0) })),
+                                       framework::dataset::make("OutQInfo", { QuantizationInfo(5.f / 32768.f, 0) })),
+                               InPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qsymm16);
@@ -211,16 +220,17 @@
 TEST_SUITE_END() // QSYMM16
 TEST_SUITE(QSYMM16ToS32)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLPixelWiseMultiplicationQSYMM16ToS32Fxture, framework::DatasetMode::ALL,
-                       combine(combine(combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                               framework::dataset::make("DataTypeIn1", DataType::QSYMM16)),
-                                                                                       framework::dataset::make("DataTypeIn2", DataType::QSYMM16)),
-                                                                               framework::dataset::make("DataTypeOut", DataType::S32)),
-                                                                       framework::dataset::make("Scale", { 1.f })),
-                                                               framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
-                                                       framework::dataset::make("RoundingPolicy", RoundingPolicy::TO_NEAREST_EVEN)),
-                                               framework::dataset::make("Src0QInfo", { QuantizationInfo(1.f / 32768.f, 0) })),
-                                       framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 32768.f, 0) })),
-                               framework::dataset::make("OutQInfo", { QuantizationInfo(1.f, 0) })))
+                       combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                       framework::dataset::make("DataTypeIn1", DataType::QSYMM16)),
+                                                                                               framework::dataset::make("DataTypeIn2", DataType::QSYMM16)),
+                                                                                       framework::dataset::make("DataTypeOut", DataType::S32)),
+                                                                               framework::dataset::make("Scale", { 1.f })),
+                                                                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
+                                                               framework::dataset::make("RoundingPolicy", RoundingPolicy::TO_NEAREST_EVEN)),
+                                                       framework::dataset::make("Src0QInfo", { QuantizationInfo(1.f / 32768.f, 0) })),
+                                               framework::dataset::make("Src1QInfo", { QuantizationInfo(2.f / 32768.f, 0) })),
+                                       framework::dataset::make("OutQInfo", { QuantizationInfo(1.f, 0) })),
+                               InPlaceDataSet))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qsymm16);
diff --git a/tests/validation/CL/PoolingLayer.cpp b/tests/validation/CL/PoolingLayer.cpp
index 611c01b..eefad4a 100644
--- a/tests/validation/CL/PoolingLayer.cpp
+++ b/tests/validation/CL/PoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -71,6 +71,12 @@
 framework::dataset::make("PadStride", { PadStrideInfo(1, 2, 1, 1) })),
 framework::dataset::make("ExcludePadding", { true }));
 
+const auto PoolingLayerDatasetFPIndicesSmall = combine(combine(combine(framework::dataset::make("PoolingType",
+{ PoolingType::MAX }),
+framework::dataset::make("PoolingSize", { Size2D(2, 2) })),
+framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 2, 0, 0) })),
+framework::dataset::make("ExcludePadding", { true, false }));
+
 constexpr AbsoluteTolerance<float>   tolerance_f32(0.001f);  /**< Tolerance value for comparing reference's output against implementation's output for 32-bit floating-point type */
 constexpr AbsoluteTolerance<float>   tolerance_f16(0.01f);   /**< Tolerance value for comparing reference's output against implementation's output for 16-bit floating-point type */
 constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);   /**< Tolerance value for comparing reference's output against implementation's output for 8-bit asymmetric type */
@@ -134,6 +140,9 @@
 template <typename T>
 using CLMixedPrecesionPoolingLayerFixture = PoolingLayerValidationMixedPrecisionFixture<CLTensor, CLAccessor, CLPoolingLayer, T>;
 
+template <typename T>
+using CLPoolingLayerIndicesFixture = PoolingLayerIndicesValidationFixture<CLTensor, CLAccessor, CLPoolingLayer, T>;
+
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSpecial, CLSpecialPoolingLayerFixture<float>, framework::DatasetMode::ALL, datasets::PoolingLayerDatasetSpecial() * framework::dataset::make("DataType", DataType::F32))
@@ -157,6 +166,17 @@
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
+
+FIXTURE_DATA_TEST_CASE(RunSmallIndices, CLPoolingLayerIndicesFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFPIndicesSmall,
+                                                                                                                        framework::dataset::make("DataType",
+                                                                                                                                DataType::F32))),
+                                                                                                                        pool_data_layout_dataset))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+    validate(CLAccessor(_target_indices), _ref_indices);
+}
+
 TEST_SUITE_END() // FP32
 
 TEST_SUITE(FP16)
@@ -176,6 +196,15 @@
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
+FIXTURE_DATA_TEST_CASE(RunSmallIndices, CLPoolingLayerIndicesFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerDatasetFPIndicesSmall,
+                                                                                                                       framework::dataset::make("DataType",
+                                                                                                                               DataType::F16))),
+                                                                                                                       pool_data_layout_dataset))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_f32);
+    validate(CLAccessor(_target_indices), _ref_indices);
+}
 TEST_SUITE_END() // FP16
 TEST_SUITE_END() // Float
 
diff --git a/tests/validation/CL/PriorBoxLayer.cpp b/tests/validation/CL/PriorBoxLayer.cpp
index 79776b5..c63b093 100644
--- a/tests/validation/CL/PriorBoxLayer.cpp
+++ b/tests/validation/CL/PriorBoxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/QLSTMLayerNormalization.cpp b/tests/validation/CL/QLSTMLayerNormalization.cpp
index 17f431c..a927be1 100644
--- a/tests/validation/CL/QLSTMLayerNormalization.cpp
+++ b/tests/validation/CL/QLSTMLayerNormalization.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/QuantizationLayer.cpp b/tests/validation/CL/QuantizationLayer.cpp
index e3f47f9..0953688 100644
--- a/tests/validation/CL/QuantizationLayer.cpp
+++ b/tests/validation/CL/QuantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/RNNLayer.cpp b/tests/validation/CL/RNNLayer.cpp
index 5ae38fb..4e67868 100644
--- a/tests/validation/CL/RNNLayer.cpp
+++ b/tests/validation/CL/RNNLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/ROIAlignLayer.cpp b/tests/validation/CL/ROIAlignLayer.cpp
index 2a3e03f..fa18a16 100644
--- a/tests/validation/CL/ROIAlignLayer.cpp
+++ b/tests/validation/CL/ROIAlignLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Range.cpp b/tests/validation/CL/Range.cpp
index b1804f2..bf81f55 100644
--- a/tests/validation/CL/Range.cpp
+++ b/tests/validation/CL/Range.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/ReduceMean.cpp b/tests/validation/CL/ReduceMean.cpp
index 55594ed..cb1e38e 100644
--- a/tests/validation/CL/ReduceMean.cpp
+++ b/tests/validation/CL/ReduceMean.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/ReductionOperation.cpp b/tests/validation/CL/ReductionOperation.cpp
index b013af1..31c5a97 100644
--- a/tests/validation/CL/ReductionOperation.cpp
+++ b/tests/validation/CL/ReductionOperation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Remap.cpp b/tests/validation/CL/Remap.cpp
index a76df2d..d849d6c 100644
--- a/tests/validation/CL/Remap.cpp
+++ b/tests/validation/CL/Remap.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/ReorgLayer.cpp b/tests/validation/CL/ReorgLayer.cpp
index 3d03740..339b368 100644
--- a/tests/validation/CL/ReorgLayer.cpp
+++ b/tests/validation/CL/ReorgLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/ReshapeLayer.cpp b/tests/validation/CL/ReshapeLayer.cpp
index 0c27084..0b9136e 100644
--- a/tests/validation/CL/ReshapeLayer.cpp
+++ b/tests/validation/CL/ReshapeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Reverse.cpp b/tests/validation/CL/Reverse.cpp
index d18c490..ed2c6e3 100644
--- a/tests/validation/CL/Reverse.cpp
+++ b/tests/validation/CL/Reverse.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/RoundLayer.cpp b/tests/validation/CL/RoundLayer.cpp
index f0dfe8e..5aa9ca6 100644
--- a/tests/validation/CL/RoundLayer.cpp
+++ b/tests/validation/CL/RoundLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/RsqrtLayer.cpp b/tests/validation/CL/RsqrtLayer.cpp
index 82fbed3..29c113b 100644
--- a/tests/validation/CL/RsqrtLayer.cpp
+++ b/tests/validation/CL/RsqrtLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Scale.cpp b/tests/validation/CL/Scale.cpp
index 848656a..523b49d 100644
--- a/tests/validation/CL/Scale.cpp
+++ b/tests/validation/CL/Scale.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,12 +28,9 @@
 #include "arm_compute/runtime/TensorAllocator.h"
 #include "tests/CL/CLAccessor.h"
 #include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/SamplingPolicyDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
+#include "tests/datasets/ScaleValidationDataset.h"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Helpers.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/ScaleFixture.h"
@@ -46,6 +43,23 @@
 {
 namespace
 {
+using datasets::ScaleShapesBaseDataSet;
+using datasets::ScaleInterpolationPolicySet;
+using datasets::ScaleDataLayouts;
+using datasets::ScaleSamplingPolicySet;
+using datasets::ScaleAlignCornersSamplingPolicySet;
+
+/** We consider vector size in byte 16 since the maximum size of
+ * a vector used by @ref CLScaleKernel is currently 16-byte (float4).
+ */
+constexpr uint32_t vector_byte = 16;
+
+template <typename T>
+constexpr uint32_t num_elements_per_vector()
+{
+    return vector_byte / sizeof(T);
+}
+
 /** CNN data types */
 const auto ScaleDataTypes = framework::dataset::make("DataType",
 {
@@ -55,11 +69,10 @@
     DataType::F32,
 });
 
-/** Align corners, this functionality is supported only by NEON */
-const auto AlignCorners = framework::dataset::make("AlignCorners",
+/** Quantization information data set */
+const auto QuantizationInfoSet = framework::dataset::make("QuantizationInfo",
 {
-    false,
-    true,
+    QuantizationInfo(0.5f, -1),
 });
 
 /** Tolerance */
@@ -76,110 +89,132 @@
 
 TEST_SUITE(CL)
 TEST_SUITE(Scale)
+TEST_SUITE(Validate)
 
-// *INDENT-OFF*
-// clang-format off
+const auto default_input_shape  = TensorShape{ 2, 3, 3, 2 };
+const auto default_output_shape = TensorShape{ 4, 6, 3, 2 };
 
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
-                framework::dataset::make("InputInfo",{ TensorInfo(TensorShape(28U, 32U, 2U), 1, DataType::F16),
-                                                       TensorInfo(TensorShape(28U, 32U, 2U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(36U, 36U, 2U, 4U), 1, DataType::U8),
-                                                       TensorInfo(TensorShape(40U, 35U, 2U, 4U), 1, DataType::S16),
-                                                       TensorInfo(TensorShape(37U, 37U, 2U), 1, DataType::F32),          // window shrink
-                                                       TensorInfo(TensorShape(37U, 37U, 3U, 4U), 1, DataType::F32),      // mismatching datatype
-                                                       TensorInfo(TensorShape(28U, 33U, 2U), 1, DataType::F32),          // policy area, scale factor not correct
-                                                    }),
-                framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 68U, 2U), 1, DataType::F16),
-                                                        TensorInfo(TensorShape(40U, 56U, 2U), 1, DataType::F32),
-                                                        TensorInfo(TensorShape(40U, 76U, 2U, 4U), 1, DataType::U8),
-                                                        TensorInfo(TensorShape(28U, 32U, 2U, 4U), 1, DataType::S16),
-                                                        TensorInfo(TensorShape(39U, 55U, 2U), 1, DataType::F32),           // window shrink
-                                                        TensorInfo(TensorShape(39U, 77U, 3U, 4U), 1, DataType::F16),       // mismatching datatype
-                                                        TensorInfo(TensorShape(26U, 21U, 2U), 1, DataType::F32),           // policy area, scale factor not correct
-                })),
-                framework::dataset::make("Policy",{ InterpolationPolicy::BILINEAR,
-                                                    InterpolationPolicy::BILINEAR,
-                                                    InterpolationPolicy::NEAREST_NEIGHBOR,
-                                                    InterpolationPolicy::NEAREST_NEIGHBOR,
-                                                    InterpolationPolicy::NEAREST_NEIGHBOR,
-                                                    InterpolationPolicy::BILINEAR,
-                                                    InterpolationPolicy::AREA,
-                })),
-                framework::dataset::make("BorderMode",{ BorderMode::UNDEFINED,
-                                                        BorderMode::UNDEFINED,
-                                                        BorderMode::UNDEFINED,
-                                                        BorderMode::UNDEFINED,
-                                                        BorderMode::UNDEFINED,
-                                                        BorderMode::UNDEFINED,
-                                                        BorderMode::UNDEFINED,
-                })),
-                framework::dataset::make("Expected", { true, true, true, true, false, false, false })),
-input_info, output_info, policy, border_mode, expected)
+constexpr auto default_data_type            = DataType::U8;
+constexpr auto default_data_layout          = DataLayout::NHWC;
+constexpr auto default_interpolation_policy = InterpolationPolicy::NEAREST_NEIGHBOR;
+constexpr auto default_border_mode          = BorderMode::UNDEFINED;
+constexpr bool default_use_padding          = false;
+
+TEST_CASE(NullPtr, framework::DatasetMode::ALL)
 {
-    Status status = CLScale::validate(&input_info.clone()->set_is_resizable(false),
-                                      &output_info.clone()->set_is_resizable(false), policy, border_mode);
-    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
+    const auto input  = TensorInfo{ default_input_shape, 1, default_data_type, default_data_layout };
+    const auto output = TensorInfo{ default_output_shape, 1, default_data_type, default_data_layout };
+    Status     result{};
+
+    // nullptr is given as input
+    result = CLScale::validate(nullptr, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode });
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
+
+    // nullptr is given as output
+    result = CLScale::validate(&input, nullptr, ScaleKernelInfo{ default_interpolation_policy, default_border_mode });
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
 }
 
-// clang-format on
-// *INDENT-ON*
-
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::MediumShapes(), ScaleDataTypes),
-                                                                                   framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                           datasets::BorderModes()),
-                                                                   datasets::SamplingPolicies()),
-               shape, data_type, policy, border_mode, sampling_policy)
+TEST_CASE(SupportDataType, framework::DatasetMode::ALL)
 {
-    std::mt19937                           generator(library->seed());
-    std::uniform_real_distribution<float>  distribution_float(0.25, 2);
-    const float                            scale_x = distribution_float(generator);
-    const float                            scale_y = distribution_float(generator);
-    std::uniform_int_distribution<uint8_t> distribution_u8(0, 255);
-    uint8_t                                constant_border_value = distribution_u8(generator);
+    const std::map<DataType, bool> supported_data_types =
+    {
+        { DataType::U8, true },
+        { DataType::S8, false },
+        { DataType::QSYMM8, false },
+        { DataType::QASYMM8, true },
+        { DataType::QASYMM8_SIGNED, true },
+        { DataType::QSYMM8_PER_CHANNEL, false },
+        { DataType::U16, false },
+        { DataType::S16, true },
+        { DataType::QSYMM16, false },
+        { DataType::QASYMM16, false },
+        { DataType::U32, false },
+        { DataType::S32, false },
+        { DataType::U64, false },
+        { DataType::S64, false },
+        { DataType::BFLOAT16, false },
+        { DataType::F16, true },
+        { DataType::F32, true },
+        { DataType::F64, false },
+        { DataType::SIZET, false },
+    };
+    Status result{};
+    for(auto &kv : supported_data_types)
+    {
+        const auto input  = TensorInfo{ default_input_shape, 1, kv.first, default_data_layout };
+        const auto output = TensorInfo{ default_output_shape, 1, kv.first, default_data_layout };
 
-    // Create tensors
-    CLTensor    src = create_tensor<CLTensor>(shape, data_type);
-    TensorShape shape_scaled(shape);
-    shape_scaled.set(0, shape[0] * scale_x);
-    shape_scaled.set(1, shape[1] * scale_y);
-    CLTensor dst = create_tensor<CLTensor>(shape_scaled, data_type);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    CLScale clscale;
-    clscale.configure(&src, &dst, policy, border_mode, constant_border_value, sampling_policy);
-
-    // Get border size depending on border mode
-    const BorderSize border_size(border_mode == BorderMode::UNDEFINED ? 0 : 1);
-
-    // Validate valid region
-    const ValidRegion dst_valid_region = calculate_valid_region_scale(*(src.info()), shape_scaled, policy, sampling_policy, (border_mode == BorderMode::UNDEFINED));
-    validate(dst.info()->valid_region(), dst_valid_region);
-
-    // Validate padding
-    PaddingCalculator calculator(shape_scaled.x(), 4);
-    calculator.set_border_mode(border_mode);
-
-    const PaddingSize read_padding(border_size);
-    const PaddingSize write_padding = calculator.required_padding(PaddingCalculator::Option::EXCLUDE_BORDER);
-    validate(src.info()->padding(), read_padding);
-    validate(dst.info()->padding(), write_padding);
+        result = CLScale::validate(&input, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode });
+        ARM_COMPUTE_EXPECT(bool(result) == kv.second, framework::LogLevel::ERRORS);
+    }
 }
 
+TEST_CASE(SameInputOutput, framework::DatasetMode::ALL)
+{
+    const auto input = TensorInfo{ default_input_shape, 1, default_data_type, default_data_layout };
+    Status     result{};
+
+    result = CLScale::validate(&input, &input, ScaleKernelInfo{ default_interpolation_policy, default_border_mode });
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(MissmatchingDataType, framework::DatasetMode::ALL)
+{
+    constexpr auto non_default_data_type = DataType::F32;
+
+    const auto input  = TensorInfo{ default_input_shape, 1, default_data_type, default_data_layout };
+    const auto output = TensorInfo{ default_output_shape, 1, non_default_data_type, default_data_layout };
+    Status     result{};
+
+    result = CLScale::validate(&input, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode });
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(AlignedCornerNotSupported, framework::DatasetMode::ALL)
+{
+    // Aligned corners require sampling policy to be TOP_LEFT.
+    constexpr auto interpolation_policy = InterpolationPolicy::BILINEAR;
+    constexpr bool align_corners        = true;
+    constexpr auto sampling_policy      = SamplingPolicy::CENTER;
+
+    const auto input  = TensorInfo{ default_input_shape, 1, default_data_type, default_data_layout };
+    const auto output = TensorInfo{ default_output_shape, 1, default_data_type, default_data_layout };
+    Status     result{};
+
+    result = CLScale::validate(&input, &output, ScaleKernelInfo{ interpolation_policy, default_border_mode, PixelValue(), sampling_policy, default_use_padding, align_corners });
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(WindowShrink, framework::DatasetMode::ALL)
+{
+    const auto input  = TensorInfo{ TensorShape(37U, 37U, 2U), 1, DataType::F32 };
+    const auto output = TensorInfo{ TensorShape(39U, 55U, 2U), 1, DataType::F32 };
+    Status     result{};
+
+    result = CLScale::validate(&input.clone()->set_is_resizable(false), &output.clone()->set_is_resizable(false), ScaleKernelInfo{ default_interpolation_policy, default_border_mode });
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(IncorrectScaleFactor, framework::DatasetMode::ALL)
+{
+    const auto     input                = TensorInfo{ TensorShape(28U, 33U, 2U), 1, DataType::F32 };
+    const auto     output               = TensorInfo{ TensorShape(26U, 21U, 2U), 1, DataType::F32 };
+    constexpr auto interpolation_policy = InterpolationPolicy::AREA;
+    Status         result{};
+
+    result = CLScale::validate(&input, &output, ScaleKernelInfo{ interpolation_policy, default_border_mode });
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
+}
+TEST_SUITE_END() // Validate
+
 template <typename T>
 using CLScaleFixture = ScaleValidationFixture<CLTensor, CLAccessor, CLScale, T>;
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLScaleFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(datasets::Tiny4DShapes(), framework::dataset::make("DataType",
-                                                                                                                     DataType::F32)),
-                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                     framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                             datasets::BorderModes()),
-                                                                                                     datasets::SamplingPolicies()),
-                                                                                             AlignCorners))
+const auto f32_shape = combine((SCALE_PRECOMMIT_SHAPE_DATASET(num_elements_per_vector<float>())), framework::dataset::make("DataType", DataType::F32));
+FIXTURE_DATA_TEST_CASE(Run, CLScaleFixture<float>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(f32_shape, ScaleSamplingPolicySet))
 {
     //Create valid region
     TensorInfo        src_info(_shape, 1, _data_type);
@@ -188,13 +223,26 @@
     // Validate output
     validate(CLAccessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32, tolerance_f32_absolute);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLScaleFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                                 DataType::F32)),
-                                                                                                                 framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                 framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         datasets::SamplingPolicies()),
-                                                                                                 AlignCorners))
+FIXTURE_DATA_TEST_CASE(RunAlignCorners, CLScaleFixture<float>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(f32_shape, ScaleAlignCornersSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo        src_info(_shape, 1, _data_type);
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32, tolerance_f32_absolute);
+}
+const auto f32_nightly_shape = combine((SCALE_NIGHTLY_SHAPE_DATASET(num_elements_per_vector<float>())), framework::dataset::make("DataType", DataType::F32));
+FIXTURE_DATA_TEST_CASE(RunNightly, CLScaleFixture<float>, framework::DatasetMode::NIGHTLY, ASSEMBLE_DATASET(f32_nightly_shape, ScaleSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo        src_info(_shape, 1, _data_type);
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32, tolerance_f32_absolute);
+}
+FIXTURE_DATA_TEST_CASE(RunNightlyAlignCorners, CLScaleFixture<float>, framework::DatasetMode::NIGHTLY, ASSEMBLE_DATASET(f32_nightly_shape, ScaleAlignCornersSamplingPolicySet))
 {
     //Create valid region
     TensorInfo        src_info(_shape, 1, _data_type);
@@ -205,13 +253,8 @@
 }
 TEST_SUITE_END() // FP32
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLScaleFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(datasets::Tiny4DShapes(), framework::dataset::make("DataType",
-                                                                                                                    DataType::F16)),
-                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                    framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                            datasets::BorderModes()),
-                                                                                                    datasets::SamplingPolicies()),
-                                                                                            AlignCorners))
+const auto f16_shape = combine((SCALE_PRECOMMIT_SHAPE_DATASET(num_elements_per_vector<half>())), framework::dataset::make("DataType", DataType::F16));
+FIXTURE_DATA_TEST_CASE(Run, CLScaleFixture<half>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(f16_shape, ScaleSamplingPolicySet))
 {
     //Create valid region
     TensorInfo        src_info(_shape, 1, _data_type);
@@ -220,13 +263,26 @@
     // Validate output
     validate(CLAccessor(_target), _reference, valid_region, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLScaleFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                                        DataType::F16)),
-                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                        framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                                datasets::BorderModes()),
-                                                                                                        datasets::SamplingPolicies()),
-                                                                                                AlignCorners))
+FIXTURE_DATA_TEST_CASE(RunAlignCorners, CLScaleFixture<half>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(f16_shape, ScaleAlignCornersSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo        src_info(_shape, 1, _data_type);
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_f16);
+}
+const auto f16_nightly_shape = combine((SCALE_NIGHTLY_SHAPE_DATASET(num_elements_per_vector<half>())), framework::dataset::make("DataType", DataType::F16));
+FIXTURE_DATA_TEST_CASE(RunNightly, CLScaleFixture<half>, framework::DatasetMode::NIGHTLY, ASSEMBLE_DATASET(f16_nightly_shape, ScaleSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo        src_info(_shape, 1, _data_type);
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_f16);
+}
+FIXTURE_DATA_TEST_CASE(RunNightlyAlignCorners, CLScaleFixture<half>, framework::DatasetMode::NIGHTLY, ASSEMBLE_DATASET(f16_nightly_shape, ScaleAlignCornersSamplingPolicySet))
 {
     //Create valid region
     TensorInfo        src_info(_shape, 1, _data_type);
@@ -240,13 +296,8 @@
 
 TEST_SUITE(Integer)
 TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLScaleFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(datasets::Tiny4DShapes(), framework::dataset::make("DataType",
-                                                                                                                       DataType::U8)),
-                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                       framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                               datasets::BorderModes()),
-                                                                                                       datasets::SamplingPolicies()),
-                                                                                               AlignCorners))
+const auto u8_shape = combine((SCALE_PRECOMMIT_SHAPE_DATASET(num_elements_per_vector<uint8_t>())), framework::dataset::make("DataType", DataType::U8));
+FIXTURE_DATA_TEST_CASE(Run, CLScaleFixture<uint8_t>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(u8_shape, ScaleSamplingPolicySet))
 {
     //Create valid region
     TensorInfo        src_info(_shape, 1, _data_type);
@@ -255,13 +306,26 @@
     // Validate output
     validate(CLAccessor(_target), _reference, valid_region, tolerance_q8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLScaleFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                                   DataType::U8)),
-                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                   framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                                   datasets::BorderModes()),
-                                                                                                           datasets::SamplingPolicies()),
-                                                                                                   AlignCorners))
+FIXTURE_DATA_TEST_CASE(RunAlignCorners, CLScaleFixture<uint8_t>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(u8_shape, ScaleAlignCornersSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo        src_info(_shape, 1, _data_type);
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_q8);
+}
+const auto u8_nightly_shape = combine((SCALE_NIGHTLY_SHAPE_DATASET(num_elements_per_vector<uint8_t>())), framework::dataset::make("DataType", DataType::U8));
+FIXTURE_DATA_TEST_CASE(RunNightly, CLScaleFixture<uint8_t>, framework::DatasetMode::NIGHTLY, ASSEMBLE_DATASET(u8_nightly_shape, ScaleSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo        src_info(_shape, 1, _data_type);
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_q8);
+}
+FIXTURE_DATA_TEST_CASE(RunNightlyAlignCorners, CLScaleFixture<uint8_t>, framework::DatasetMode::NIGHTLY, ASSEMBLE_DATASET(u8_nightly_shape, ScaleAlignCornersSamplingPolicySet))
 {
     //Create valid region
     TensorInfo        src_info(_shape, 1, _data_type);
@@ -272,13 +336,8 @@
 }
 TEST_SUITE_END() // U8
 TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLScaleFixture<int16_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(datasets::Tiny4DShapes(), framework::dataset::make("DataType",
-                                                                                                                       DataType::S16)),
-                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                       framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                               datasets::BorderModes()),
-                                                                                                       datasets::SamplingPolicies()),
-                                                                                               AlignCorners))
+const auto s16_shape = combine((SCALE_PRECOMMIT_SHAPE_DATASET(num_elements_per_vector<int16_t>())), framework::dataset::make("DataType", DataType::S16));
+FIXTURE_DATA_TEST_CASE(Run, CLScaleFixture<int16_t>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(s16_shape, ScaleSamplingPolicySet))
 {
     //Create valid region
     TensorInfo        src_info(_shape, 1, _data_type);
@@ -287,13 +346,26 @@
     // Validate output
     validate(CLAccessor(_target), _reference, valid_region, tolerance_s16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLScaleFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                                   DataType::S16)),
-                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                   framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                                   datasets::BorderModes()),
-                                                                                                           datasets::SamplingPolicies()),
-                                                                                                   AlignCorners))
+FIXTURE_DATA_TEST_CASE(RunAlignCorners, CLScaleFixture<int16_t>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(s16_shape, ScaleAlignCornersSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo        src_info(_shape, 1, _data_type);
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_s16);
+}
+const auto s16_nightly_shape = combine((SCALE_NIGHTLY_SHAPE_DATASET(num_elements_per_vector<int16_t>())), framework::dataset::make("DataType", DataType::S16));
+FIXTURE_DATA_TEST_CASE(RunNightly, CLScaleFixture<int16_t>, framework::DatasetMode::NIGHTLY, ASSEMBLE_DATASET(s16_nightly_shape, ScaleSamplingPolicySet))
+{
+    //Create valid region
+    TensorInfo        src_info(_shape, 1, _data_type);
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_s16);
+}
+FIXTURE_DATA_TEST_CASE(RunNightlyAlignCorners, CLScaleFixture<int16_t>, framework::DatasetMode::NIGHTLY, ASSEMBLE_DATASET(s16_nightly_shape, ScaleAlignCornersSamplingPolicySet))
 {
     //Create valid region
     TensorInfo        src_info(_shape, 1, _data_type);
@@ -309,15 +381,8 @@
 using CLScaleQuantizedFixture = ScaleValidationQuantizedFixture<CLTensor, CLAccessor, CLScale, T>;
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLScaleQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::Tiny4DShapes(),
-                                                                                                                        framework::dataset::make("DataType",
-                                                                                                                                DataType::QASYMM8)),
-                                                                                                                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -1) })),
-                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                        framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                                        datasets::BorderModes()),
-                                                                                                                datasets::SamplingPolicies()),
-                                                                                                        AlignCorners))
+const auto qasymm8_shape = combine((SCALE_PRECOMMIT_SHAPE_DATASET(num_elements_per_vector<uint8_t>())), framework::dataset::make("DataType", DataType::QASYMM8));
+FIXTURE_DATA_TEST_CASE(Run, CLScaleQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, ASSEMBLE_QUANTIZED_DATASET(qasymm8_shape, ScaleSamplingPolicySet, QuantizationInfoSet))
 {
     //Create valid region
     TensorInfo        src_info(_shape, 1, _data_type);
@@ -326,15 +391,28 @@
     // Validate output
     validate(CLAccessor(_target), _reference, valid_region, tolerance_q8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLScaleQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(datasets::LargeShapes(),
-                                                                                                                    framework::dataset::make("DataType",
-                                                                                                                            DataType::QASYMM8)),
-                                                                                                                    framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -1) })),
-                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                    framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                                    datasets::BorderModes()),
-                                                                                                                    datasets::SamplingPolicies()),
-                                                                                                            AlignCorners))
+FIXTURE_DATA_TEST_CASE(RunAlignCorners, CLScaleQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, ASSEMBLE_QUANTIZED_DATASET(qasymm8_shape, ScaleAlignCornersSamplingPolicySet,
+                       QuantizationInfoSet))
+{
+    //Create valid region
+    TensorInfo        src_info(_shape, 1, _data_type);
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_q8);
+}
+const auto qasymm8_nightly_shape = combine((SCALE_NIGHTLY_SHAPE_DATASET(num_elements_per_vector<uint8_t>())), framework::dataset::make("DataType", DataType::QASYMM8));
+FIXTURE_DATA_TEST_CASE(RunNightly, CLScaleQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, ASSEMBLE_QUANTIZED_DATASET(qasymm8_nightly_shape, ScaleSamplingPolicySet, QuantizationInfoSet))
+{
+    //Create valid region
+    TensorInfo        src_info(_shape, 1, _data_type);
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_q8);
+}
+FIXTURE_DATA_TEST_CASE(RunNightlyAlignCorners, CLScaleQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY, ASSEMBLE_QUANTIZED_DATASET(qasymm8_nightly_shape, ScaleAlignCornersSamplingPolicySet,
+                       QuantizationInfoSet))
 {
     //Create valid region
     TensorInfo        src_info(_shape, 1, _data_type);
@@ -345,14 +423,8 @@
 }
 TEST_SUITE_END() // QASYMM8
 TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLScaleQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::Tiny4DShapes(),
-                                                                                                                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -1) })),
-                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                       framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                                       datasets::BorderModes()),
-                                                                                                               datasets::SamplingPolicies()),
-                                                                                                       AlignCorners))
+const auto qasymm8_signed_shape = combine((SCALE_PRECOMMIT_SHAPE_DATASET(num_elements_per_vector<int8_t>())), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED));
+FIXTURE_DATA_TEST_CASE(Run, CLScaleQuantizedFixture<int8_t>, framework::DatasetMode::ALL, ASSEMBLE_QUANTIZED_DATASET(qasymm8_signed_shape, ScaleSamplingPolicySet, QuantizationInfoSet))
 {
     //Create valid region
     TensorInfo        src_info(_shape, 1, _data_type);
@@ -361,14 +433,30 @@
     // Validate output
     validate(CLAccessor(_target), _reference, valid_region, tolerance_qs8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLScaleQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(datasets::LargeShapes(),
-                                                                                                                   framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                                                                                   framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -1) })),
-                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                   framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                                   datasets::BorderModes()),
-                                                                                                                   datasets::SamplingPolicies()),
-                                                                                                           AlignCorners))
+FIXTURE_DATA_TEST_CASE(RunAlignCorners, CLScaleQuantizedFixture<int8_t>, framework::DatasetMode::ALL, ASSEMBLE_QUANTIZED_DATASET(qasymm8_signed_shape, ScaleAlignCornersSamplingPolicySet,
+                       QuantizationInfoSet))
+{
+    //Create valid region
+    TensorInfo        src_info(_shape, 1, _data_type);
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_qs8);
+}
+const auto qasymm8_signed_nightly_shape = combine((SCALE_NIGHTLY_SHAPE_DATASET(num_elements_per_vector<int8_t>())), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED));
+FIXTURE_DATA_TEST_CASE(RunNightly, CLScaleQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY, ASSEMBLE_QUANTIZED_DATASET(qasymm8_signed_nightly_shape, ScaleSamplingPolicySet,
+                       QuantizationInfoSet))
+{
+    //Create valid region
+    TensorInfo        src_info(_shape, 1, _data_type);
+    const ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(CLAccessor(_target), _reference, valid_region, tolerance_qs8);
+}
+FIXTURE_DATA_TEST_CASE(RunNightlyAlignCorners, CLScaleQuantizedFixture<int8_t>, framework::DatasetMode::NIGHTLY, ASSEMBLE_QUANTIZED_DATASET(qasymm8_signed_nightly_shape,
+                       ScaleAlignCornersSamplingPolicySet,
+                       QuantizationInfoSet))
 {
     //Create valid region
     TensorInfo        src_info(_shape, 1, _data_type);
diff --git a/tests/validation/CL/Scharr.cpp b/tests/validation/CL/Scharr.cpp
index cd3073f..fa6b48f 100644
--- a/tests/validation/CL/Scharr.cpp
+++ b/tests/validation/CL/Scharr.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Select.cpp b/tests/validation/CL/Select.cpp
index f366ce7..13a8cf4 100644
--- a/tests/validation/CL/Select.cpp
+++ b/tests/validation/CL/Select.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/SinLayer.cpp b/tests/validation/CL/SinLayer.cpp
index 5a75f7f..e40c990 100644
--- a/tests/validation/CL/SinLayer.cpp
+++ b/tests/validation/CL/SinLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Slice.cpp b/tests/validation/CL/Slice.cpp
index 91c2e90..50b880e 100644
--- a/tests/validation/CL/Slice.cpp
+++ b/tests/validation/CL/Slice.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Sobel.cpp b/tests/validation/CL/Sobel.cpp
index 29608ef..3670003 100644
--- a/tests/validation/CL/Sobel.cpp
+++ b/tests/validation/CL/Sobel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/SoftmaxLayer.cpp b/tests/validation/CL/SoftmaxLayer.cpp
index 5ee929f..90c3058 100644
--- a/tests/validation/CL/SoftmaxLayer.cpp
+++ b/tests/validation/CL/SoftmaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -51,14 +51,6 @@
 constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);
 constexpr AbsoluteTolerance<int8_t>  tolerance_qasymm8_signed(1);
 
-/*
- The following tolerance number is used as a workaround for the mismatches
- caused by float computation in reference (and NEON) kernel
- and integer computations in OpenCL kernel.
- COMPMID-2958 is created to investigate this.
-*/
-constexpr float tolerance_number_qasymm8_signed = 0.05f;
-
 /** CNN data types */
 const auto CNNDataTypes = framework::dataset::make("DataType",
 {
@@ -109,7 +101,7 @@
 
 // *INDENT-OFF*
 // clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
                framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::F32),    // Mismatching data types
                                                        TensorInfo(TensorShape(27U, 13U), 1, DataType::F32),    // Mismatching shapes
                                                        TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, // Invalid output quantization info
@@ -120,6 +112,10 @@
                                                        TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
                                                                   QuantizationInfo(1.f/256, 12)),
                                                        TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8_SIGNED,
+                                                                  QuantizationInfo(1.f/256, 12)),
+                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8_SIGNED, // Invalid axis high
+                                                                  QuantizationInfo(1.f/256, 12)),
+                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8_SIGNED, // Invalid axis low
                                                                   QuantizationInfo(1.f/256, 12))
                                                       }),
                framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U), 1, DataType::F16),
@@ -133,11 +129,38 @@
                                                                   QuantizationInfo(1.f/256, 0)),
                                                        TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8_SIGNED,
                                                                   QuantizationInfo(1.f/256, -128)),
+                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8_SIGNED, // Invalid axis high
+                                                                  QuantizationInfo(1.f/256, -128)),
+                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8_SIGNED, // Invalid axis low
+                                                                  QuantizationInfo(1.f/256, -128)),
                                                      })),
-               framework::dataset::make("Expected", { false, false, false, false, false, true, true, true })),
-               input_info, output_info, expected)
+               framework::dataset::make("beta", { 1.0,
+                                                  2.0,
+                                                  1.0,
+                                                  2.0,
+                                                  1.0,
+                                                  2.0,
+                                                  1.0,
+                                                  2.0,
+                                                  1.0,
+                                                  2.0,
+                                                })),
+               framework::dataset::make("axis", {
+                                                  0,
+                                                  0,
+                                                  0,
+                                                  0,
+                                                  0,
+                                                  0,
+                                                  0,
+                                                  0,
+                                                  2,
+                                                  -1,
+                                                })),
+               framework::dataset::make("Expected", { false, false, false, false, false, true, true, true, false, false })),
+               input_info, output_info, beta, axis, expected)
 {
-    ARM_COMPUTE_EXPECT(bool(CLSoftmaxLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false))) == expected, framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(bool(CLSoftmaxLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), beta, axis)) == expected, framework::LogLevel::ERRORS);
 }
 // clang-format on
 // *INDENT-ON*
@@ -150,7 +173,7 @@
 FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
                                                                                                                    framework::dataset::make("DataType", DataType::F16)),
                                                                                                            framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                   framework::dataset::make("Axis", { 1, 2 })))
+                                                                                                   framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
@@ -158,7 +181,7 @@
 FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
                                                                                                                        framework::dataset::make("DataType", DataType::F16)),
                                                                                                                framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                       framework::dataset::make("Axis", { 1, 2 })))
+                                                                                                       framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
@@ -166,7 +189,7 @@
 FIXTURE_DATA_TEST_CASE(Run4D, CLSoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayer4DShapes(),
                                                                                                                     framework::dataset::make("DataType", DataType::F16)),
                                                                                                             framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                    framework::dataset::make("Axis", { 1, 2, 3 })))
+                                                                                                    framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
@@ -177,7 +200,7 @@
 FIXTURE_DATA_TEST_CASE(RunSmall, CLSoftmaxLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
                                                                                                                     framework::dataset::make("DataType", DataType::F32)),
                                                                                                             framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                    framework::dataset::make("Axis", { 1, 2 })))
+                                                                                                    framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
@@ -185,7 +208,7 @@
 FIXTURE_DATA_TEST_CASE(RunLarge, CLSoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
                                                                                                                         framework::dataset::make("DataType", DataType::F32)),
                                                                                                                 framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                        framework::dataset::make("Axis", { 1, 2 })))
+                                                                                                        framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
@@ -193,7 +216,7 @@
 FIXTURE_DATA_TEST_CASE(Run4D, CLSoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayer4DShapes(),
                                                                                                                      framework::dataset::make("DataType", DataType::F32)),
                                                                                                              framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                     framework::dataset::make("Axis", { 1, 2, 3 })))
+                                                                                                     framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
@@ -210,7 +233,7 @@
                                                                                                                        framework::dataset::make("DataType", DataType::QASYMM8)),
                                                                                                                        combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
                                                                                                                                framework::dataset::make("Beta", { 1.0f, 2.f }))),
-                                                                                                               framework::dataset::make("Axis", { 1, 2 })))
+                                                                                                               framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
@@ -219,7 +242,7 @@
                                                                                                                    framework::dataset::make("DataType", DataType::QASYMM8)),
                                                                                                                    combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
                                                                                                                            framework::dataset::make("Beta", { 1.0f, 2.0f }))),
-                                                                                                                   framework::dataset::make("Axis", { 1 })))
+                                                                                                                   framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
@@ -228,7 +251,7 @@
                                                                                                                         framework::dataset::make("DataType", DataType::QASYMM8)),
                                                                                                                         combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
                                                                                                                                 framework::dataset::make("Beta", { 1.0f, 2.0f }))),
-                                                                                                                framework::dataset::make("Axis", { 1, 2, 3 })))
+                                                                                                                framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
@@ -242,10 +265,10 @@
                                                                                                                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
                                                                                                                       combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
                                                                                                                               framework::dataset::make("Beta", { 1.0f, 2.f }))),
-                                                                                                              framework::dataset::make("Axis", { 1, 2 })))
+                                                                                                              framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
-    validate(CLAccessor(_target), _reference, tolerance_qasymm8_signed, tolerance_number_qasymm8_signed);
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8_signed);
 }
 
 TEST_SUITE_END() // QASYMM8_SIGNED
diff --git a/tests/validation/CL/SpaceToBatchLayer.cpp b/tests/validation/CL/SpaceToBatchLayer.cpp
index 3ddbcd8..b233939 100644
--- a/tests/validation/CL/SpaceToBatchLayer.cpp
+++ b/tests/validation/CL/SpaceToBatchLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/SpaceToDepthLayer.cpp b/tests/validation/CL/SpaceToDepthLayer.cpp
index 84194d7..25b4bcd 100644
--- a/tests/validation/CL/SpaceToDepthLayer.cpp
+++ b/tests/validation/CL/SpaceToDepthLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Split.cpp b/tests/validation/CL/Split.cpp
index 8f55b42..99110ff 100644
--- a/tests/validation/CL/Split.cpp
+++ b/tests/validation/CL/Split.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/StackLayer.cpp b/tests/validation/CL/StackLayer.cpp
index 3f17aa1..6f38c55 100644
--- a/tests/validation/CL/StackLayer.cpp
+++ b/tests/validation/CL/StackLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/StridedSlice.cpp b/tests/validation/CL/StridedSlice.cpp
index 00eee16..9bfad55 100644
--- a/tests/validation/CL/StridedSlice.cpp
+++ b/tests/validation/CL/StridedSlice.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/TableLookup.cpp b/tests/validation/CL/TableLookup.cpp
index fea3e58..b611ef6 100644
--- a/tests/validation/CL/TableLookup.cpp
+++ b/tests/validation/CL/TableLookup.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Threshold.cpp b/tests/validation/CL/Threshold.cpp
index 9c68ffe..215565e 100644
--- a/tests/validation/CL/Threshold.cpp
+++ b/tests/validation/CL/Threshold.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -52,7 +52,7 @@
 
     // Create and configure function
     CLThreshold thrsh;
-    thrsh.configure(&src, &dst, threshold, false_value, true_value, type, upper);
+    thrsh.configure(&src, &dst, ThresholdKernelInfo(threshold, false_value, true_value, type, upper));
 
     // Validate valid region
     const ValidRegion valid_region = shape_to_valid_region(shape);
diff --git a/tests/validation/CL/Tile.cpp b/tests/validation/CL/Tile.cpp
index e1f0125..73f4aa8 100644
--- a/tests/validation/CL/Tile.cpp
+++ b/tests/validation/CL/Tile.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Transpose.cpp b/tests/validation/CL/Transpose.cpp
index d60e895..3a1a27d 100644
--- a/tests/validation/CL/Transpose.cpp
+++ b/tests/validation/CL/Transpose.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/UNIT/CompileContext.cpp b/tests/validation/CL/UNIT/CompileContext.cpp
index 5245044..06f7eae 100644
--- a/tests/validation/CL/UNIT/CompileContext.cpp
+++ b/tests/validation/CL/UNIT/CompileContext.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/UNIT/DynamicTensor.cpp b/tests/validation/CL/UNIT/DynamicTensor.cpp
index f5cc1f7..b630284 100644
--- a/tests/validation/CL/UNIT/DynamicTensor.cpp
+++ b/tests/validation/CL/UNIT/DynamicTensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/UNIT/MemoryManager.cpp b/tests/validation/CL/UNIT/MemoryManager.cpp
index 8167026..4db062e 100644
--- a/tests/validation/CL/UNIT/MemoryManager.cpp
+++ b/tests/validation/CL/UNIT/MemoryManager.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/UNIT/TensorAllocator.cpp b/tests/validation/CL/UNIT/TensorAllocator.cpp
index a3aabf9..9db98fb 100644
--- a/tests/validation/CL/UNIT/TensorAllocator.cpp
+++ b/tests/validation/CL/UNIT/TensorAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/UNIT/Tuner.cpp b/tests/validation/CL/UNIT/Tuner.cpp
index 6345b7c..ee5c76c 100644
--- a/tests/validation/CL/UNIT/Tuner.cpp
+++ b/tests/validation/CL/UNIT/Tuner.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/UNIT/WeightsRetention.cpp b/tests/validation/CL/UNIT/WeightsRetention.cpp
index 9a6fddc..7234e47 100644
--- a/tests/validation/CL/UNIT/WeightsRetention.cpp
+++ b/tests/validation/CL/UNIT/WeightsRetention.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/Unstack.cpp b/tests/validation/CL/Unstack.cpp
index 13b8872..4f3fe5c 100644
--- a/tests/validation/CL/Unstack.cpp
+++ b/tests/validation/CL/Unstack.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/UpsampleLayer.cpp b/tests/validation/CL/UpsampleLayer.cpp
index f19ec89..eff51a4 100644
--- a/tests/validation/CL/UpsampleLayer.cpp
+++ b/tests/validation/CL/UpsampleLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/WarpAffine.cpp b/tests/validation/CL/WarpAffine.cpp
index 29f24cc..7779761 100644
--- a/tests/validation/CL/WarpAffine.cpp
+++ b/tests/validation/CL/WarpAffine.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/WarpPerspective.cpp b/tests/validation/CL/WarpPerspective.cpp
index e164a80..4b97505 100644
--- a/tests/validation/CL/WarpPerspective.cpp
+++ b/tests/validation/CL/WarpPerspective.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/WeightsReshape.cpp b/tests/validation/CL/WeightsReshape.cpp
index ff7ed82..3e7ecc3 100644
--- a/tests/validation/CL/WeightsReshape.cpp
+++ b/tests/validation/CL/WeightsReshape.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/WidthConcatenateLayer.cpp b/tests/validation/CL/WidthConcatenateLayer.cpp
index 7b894a6..408fe14 100644
--- a/tests/validation/CL/WidthConcatenateLayer.cpp
+++ b/tests/validation/CL/WidthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -71,7 +71,7 @@
     inputs_vector_info.emplace_back(std::move(input_info1));
     inputs_vector_info.emplace_back(std::move(input_info2));
 
-    std::vector<ITensorInfo *> inputs_vector_info_raw;
+    std::vector<const ITensorInfo *> inputs_vector_info_raw;
     inputs_vector_info_raw.reserve(inputs_vector_info.size());
     for(auto &input : inputs_vector_info)
     {
@@ -98,8 +98,8 @@
     ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
 
     // Create and configure function
-    CLConcatenateLayer       concat_layer;
-    std::vector<ICLTensor *> inputs;
+    CLConcatenateLayer             concat_layer;
+    std::vector<const ICLTensor *> inputs;
     inputs.emplace_back(&src1);
     inputs.emplace_back(&src2);
     inputs.emplace_back(&src3);
diff --git a/tests/validation/CL/Winograd.cpp b/tests/validation/CL/Winograd.cpp
index 511aa4b..771acf9 100644
--- a/tests/validation/CL/Winograd.cpp
+++ b/tests/validation/CL/Winograd.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CL/YOLOLayer.cpp b/tests/validation/CL/YOLOLayer.cpp
index b487ea6..f28082b 100644
--- a/tests/validation/CL/YOLOLayer.cpp
+++ b/tests/validation/CL/YOLOLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CPP/DFT.cpp b/tests/validation/CPP/DFT.cpp
index d4020f2..e19e850 100644
--- a/tests/validation/CPP/DFT.cpp
+++ b/tests/validation/CPP/DFT.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CPP/DetectionPostProcessLayer.cpp b/tests/validation/CPP/DetectionPostProcessLayer.cpp
index 934ffea..166299e 100644
--- a/tests/validation/CPP/DetectionPostProcessLayer.cpp
+++ b/tests/validation/CPP/DetectionPostProcessLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CPP/NonMaximumSuppression.cpp b/tests/validation/CPP/NonMaximumSuppression.cpp
index bf24b2c..b85fe50 100644
--- a/tests/validation/CPP/NonMaximumSuppression.cpp
+++ b/tests/validation/CPP/NonMaximumSuppression.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CPP/Permute.cpp b/tests/validation/CPP/Permute.cpp
index aab63e6..f0f5346 100644
--- a/tests/validation/CPP/Permute.cpp
+++ b/tests/validation/CPP/Permute.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/CPP/TopKV.cpp b/tests/validation/CPP/TopKV.cpp
index e528c62..3ba89c9 100644
--- a/tests/validation/CPP/TopKV.cpp
+++ b/tests/validation/CPP/TopKV.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/GLES_COMPUTE/ActivationLayer.cpp b/tests/validation/GLES_COMPUTE/ActivationLayer.cpp
index fdb9d18..21384c4 100644
--- a/tests/validation/GLES_COMPUTE/ActivationLayer.cpp
+++ b/tests/validation/GLES_COMPUTE/ActivationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/GLES_COMPUTE/ArithmeticAddition.cpp b/tests/validation/GLES_COMPUTE/ArithmeticAddition.cpp
index 82946fa..4a42094 100755
--- a/tests/validation/GLES_COMPUTE/ArithmeticAddition.cpp
+++ b/tests/validation/GLES_COMPUTE/ArithmeticAddition.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/GLES_COMPUTE/BatchNormalizationLayer.cpp b/tests/validation/GLES_COMPUTE/BatchNormalizationLayer.cpp
index d08459d..474dd02 100644
--- a/tests/validation/GLES_COMPUTE/BatchNormalizationLayer.cpp
+++ b/tests/validation/GLES_COMPUTE/BatchNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/GLES_COMPUTE/ConvolutionLayer.cpp b/tests/validation/GLES_COMPUTE/ConvolutionLayer.cpp
index 2813d29..3e3c832 100644
--- a/tests/validation/GLES_COMPUTE/ConvolutionLayer.cpp
+++ b/tests/validation/GLES_COMPUTE/ConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/GLES_COMPUTE/DepthConcatenateLayer.cpp b/tests/validation/GLES_COMPUTE/DepthConcatenateLayer.cpp
index 04e91d6..6ad6fcf 100644
--- a/tests/validation/GLES_COMPUTE/DepthConcatenateLayer.cpp
+++ b/tests/validation/GLES_COMPUTE/DepthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,7 +43,7 @@
 TEST_SUITE(DepthConcatenateLayer)
 
 template <typename T>
-using GCDepthConcatenateLayerFixture = ConcatenateLayerValidationFixture<GCTensor, IGCTensor, GCAccessor, GCConcatenateLayer, T>;
+using GCDepthConcatenateLayerFixture = ConcatenateLayerValidationFixture<GCTensor, IGCTensor, GCAccessor, GCConcatenateLayer, T, false>;
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
diff --git a/tests/validation/GLES_COMPUTE/DepthwiseConvolutionLayer.cpp b/tests/validation/GLES_COMPUTE/DepthwiseConvolutionLayer.cpp
index c31cae3..52377f6 100644
--- a/tests/validation/GLES_COMPUTE/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/GLES_COMPUTE/DepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/GLES_COMPUTE/DirectConvolutionLayer.cpp b/tests/validation/GLES_COMPUTE/DirectConvolutionLayer.cpp
index 0942b07..ee95a01 100644
--- a/tests/validation/GLES_COMPUTE/DirectConvolutionLayer.cpp
+++ b/tests/validation/GLES_COMPUTE/DirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/GLES_COMPUTE/DirectConvolutionLayerTensorShift.cpp b/tests/validation/GLES_COMPUTE/DirectConvolutionLayerTensorShift.cpp
index 0da90470..8d5f93d 100644
--- a/tests/validation/GLES_COMPUTE/DirectConvolutionLayerTensorShift.cpp
+++ b/tests/validation/GLES_COMPUTE/DirectConvolutionLayerTensorShift.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/GLES_COMPUTE/DropoutLayer.cpp b/tests/validation/GLES_COMPUTE/DropoutLayer.cpp
index 4d54dad..c32f130 100644
--- a/tests/validation/GLES_COMPUTE/DropoutLayer.cpp
+++ b/tests/validation/GLES_COMPUTE/DropoutLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/GLES_COMPUTE/FullyConnectedLayer.cpp b/tests/validation/GLES_COMPUTE/FullyConnectedLayer.cpp
index 1ef2fb9..53f63ce 100644
--- a/tests/validation/GLES_COMPUTE/FullyConnectedLayer.cpp
+++ b/tests/validation/GLES_COMPUTE/FullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/GLES_COMPUTE/GEMM.cpp b/tests/validation/GLES_COMPUTE/GEMM.cpp
index e8184fe..13af521 100644
--- a/tests/validation/GLES_COMPUTE/GEMM.cpp
+++ b/tests/validation/GLES_COMPUTE/GEMM.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/GLES_COMPUTE/GlobalPoolingLayer.cpp b/tests/validation/GLES_COMPUTE/GlobalPoolingLayer.cpp
index 162f189..fd110c6 100644
--- a/tests/validation/GLES_COMPUTE/GlobalPoolingLayer.cpp
+++ b/tests/validation/GLES_COMPUTE/GlobalPoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/GLES_COMPUTE/NormalizationLayer.cpp b/tests/validation/GLES_COMPUTE/NormalizationLayer.cpp
index 67dca32..d733a58 100644
--- a/tests/validation/GLES_COMPUTE/NormalizationLayer.cpp
+++ b/tests/validation/GLES_COMPUTE/NormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/GLES_COMPUTE/NormalizePlanarYUVLayer.cpp b/tests/validation/GLES_COMPUTE/NormalizePlanarYUVLayer.cpp
index 540a2be..ed6b5f0 100644
--- a/tests/validation/GLES_COMPUTE/NormalizePlanarYUVLayer.cpp
+++ b/tests/validation/GLES_COMPUTE/NormalizePlanarYUVLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/GLES_COMPUTE/PoolingLayer.cpp b/tests/validation/GLES_COMPUTE/PoolingLayer.cpp
index e507480..5424707 100644
--- a/tests/validation/GLES_COMPUTE/PoolingLayer.cpp
+++ b/tests/validation/GLES_COMPUTE/PoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/GLES_COMPUTE/Scale.cpp b/tests/validation/GLES_COMPUTE/Scale.cpp
index e6526b8..5e7f39f 100644
--- a/tests/validation/GLES_COMPUTE/Scale.cpp
+++ b/tests/validation/GLES_COMPUTE/Scale.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -52,7 +52,7 @@
     DataType::F16,
 });
 
-/** Align corners, this functionality is supported only by NEON */
+/** Aligned corners, this functionality is supported only by NEON and OpenCL backends */
 const auto AlignCorners = framework::dataset::make("AlignCorners",
 {
     false,
@@ -90,7 +90,7 @@
 
     // Create and configure function
     GCScale gcscale;
-    gcscale.configure(&src, &dst, policy, border_mode, constant_border_value, sampling_policy);
+    gcscale.configure(&src, &dst, ScaleKernelInfo{ policy, border_mode, constant_border_value, sampling_policy });
 
     // Get border size depending on border mode
     const BorderSize border_size(border_mode == BorderMode::UNDEFINED ? 0 : 1);
diff --git a/tests/validation/GLES_COMPUTE/SoftmaxLayer.cpp b/tests/validation/GLES_COMPUTE/SoftmaxLayer.cpp
index 3b55717..af92cff 100644
--- a/tests/validation/GLES_COMPUTE/SoftmaxLayer.cpp
+++ b/tests/validation/GLES_COMPUTE/SoftmaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -89,7 +89,7 @@
 FIXTURE_DATA_TEST_CASE(RunSmall, GCSoftmaxLayerFixture<half_float::half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
                                                                                                                      framework::dataset::make("DataType", DataType::F16)),
                                                                                                                      framework::dataset::make("Beta", 1.0f)),
-                                                                                                                     framework::dataset::make("Axis", 1)))
+                                                                                                                     framework::dataset::make("ReduceEndAxis", 0)))
 {
     // Validate output
     validate(GCAccessor(_target), _reference, tolerance_f16);
@@ -97,7 +97,7 @@
 FIXTURE_DATA_TEST_CASE(RunLarge, GCSoftmaxLayerFixture<half_float::half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
                                                                                                                    framework::dataset::make("DataType", DataType::F16)),
                                                                                                                    framework::dataset::make("Beta", 1.0f)),
-                                                                                                                   framework::dataset::make("Axis", 1)))
+                                                                                                                   framework::dataset::make("ReduceEndAxis", 0)))
 {
     // Validate output
     validate(GCAccessor(_target), _reference, tolerance_f16);
@@ -108,7 +108,7 @@
 FIXTURE_DATA_TEST_CASE(RunSmall, GCSoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
                                                                                                                   framework::dataset::make("DataType", DataType::F32)),
                                                                                                                   framework::dataset::make("Beta", 1.0f)),
-                                                                                                          framework::dataset::make("Axis", 1)))
+                                                                                                          framework::dataset::make("ReduceEndAxis", 0)))
 {
     // Validate output
     validate(GCAccessor(_target), _reference, tolerance_f32);
@@ -116,7 +116,7 @@
 FIXTURE_DATA_TEST_CASE(RunLarge, GCSoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
                                                                                                                         framework::dataset::make("DataType", DataType::F32)),
                                                                                                                 framework::dataset::make("Beta", 1.0f)),
-                                                                                                        framework::dataset::make("Axis", 1)))
+                                                                                                        framework::dataset::make("ReduceEndAxis", 0)))
 {
     // Validate output
     validate(GCAccessor(_target), _reference, tolerance_f32);
diff --git a/tests/validation/GLES_COMPUTE/Transpose.cpp b/tests/validation/GLES_COMPUTE/Transpose.cpp
index 7625784..d1c640d 100644
--- a/tests/validation/GLES_COMPUTE/Transpose.cpp
+++ b/tests/validation/GLES_COMPUTE/Transpose.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/GLES_COMPUTE/UNIT/DynamicTensor.cpp b/tests/validation/GLES_COMPUTE/UNIT/DynamicTensor.cpp
index 248b7fd..72851a5 100644
--- a/tests/validation/GLES_COMPUTE/UNIT/DynamicTensor.cpp
+++ b/tests/validation/GLES_COMPUTE/UNIT/DynamicTensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/GLES_COMPUTE/UNIT/MemoryManager.cpp b/tests/validation/GLES_COMPUTE/UNIT/MemoryManager.cpp
index 99ed84a..b4fd0eb 100644
--- a/tests/validation/GLES_COMPUTE/UNIT/MemoryManager.cpp
+++ b/tests/validation/GLES_COMPUTE/UNIT/MemoryManager.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/Helpers.cpp b/tests/validation/Helpers.cpp
index 6e93cd0..eb8bdcf 100644
--- a/tests/validation/Helpers.cpp
+++ b/tests/validation/Helpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/Helpers.h b/tests/validation/Helpers.h
index 9c88973..325cc00 100644
--- a/tests/validation/Helpers.h
+++ b/tests/validation/Helpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -90,7 +90,6 @@
         case DataType::F32:
             switch(activation)
             {
-                case ActivationLayerInfo::ActivationFunction::LOGISTIC:
                 case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
                     // Reduce range as exponent overflows
                     bounds = std::make_pair(-40.f, 40.f);
diff --git a/tests/validation/NEON/AbsoluteDifference.cpp b/tests/validation/NEON/AbsoluteDifference.cpp
index a2debcc..9e9a7db 100644
--- a/tests/validation/NEON/AbsoluteDifference.cpp
+++ b/tests/validation/NEON/AbsoluteDifference.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Accumulate.cpp b/tests/validation/NEON/Accumulate.cpp
index 6176eb0..e490698 100644
--- a/tests/validation/NEON/Accumulate.cpp
+++ b/tests/validation/NEON/Accumulate.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/ActivationLayer.cpp b/tests/validation/NEON/ActivationLayer.cpp
index 063bfaa..33e2850 100644
--- a/tests/validation/NEON/ActivationLayer.cpp
+++ b/tests/validation/NEON/ActivationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -60,6 +60,7 @@
         case ActivationLayerInfo::ActivationFunction::ELU:
         case ActivationLayerInfo::ActivationFunction::SQRT:
         case ActivationLayerInfo::ActivationFunction::TANH:
+        case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
             switch(data_type)
             {
                 case DataType::F16:
@@ -87,6 +88,7 @@
         case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
         case ActivationLayerInfo::ActivationFunction::SQRT:
         case ActivationLayerInfo::ActivationFunction::TANH:
+        case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
             switch(data_type)
             {
                 case DataType::F16:
diff --git a/tests/validation/NEON/ArgMinMax.cpp b/tests/validation/NEON/ArgMinMax.cpp
index e7ab4a4..6aa382d 100644
--- a/tests/validation/NEON/ArgMinMax.cpp
+++ b/tests/validation/NEON/ArgMinMax.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/ArithmeticAddition.cpp b/tests/validation/NEON/ArithmeticAddition.cpp
index d1b6ce2..7b3d4f9 100644
--- a/tests/validation/NEON/ArithmeticAddition.cpp
+++ b/tests/validation/NEON/ArithmeticAddition.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -52,6 +52,8 @@
                                                  DataType::U8));
 const auto ArithmeticAdditionS16Dataset = combine(combine(framework::dataset::make("DataType", { DataType::U8, DataType::S16 }), framework::dataset::make("DataType", DataType::S16)),
                                                   framework::dataset::make("DataType", DataType::S16));
+const auto ArithmeticAdditionS32Dataset = combine(combine(framework::dataset::make("DataType", { DataType::S32 }), framework::dataset::make("DataType", DataType::S32)),
+                                                  framework::dataset::make("DataType", DataType::S32));
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 const auto ArithmeticAdditionFP16Dataset = combine(combine(framework::dataset::make("DataType", DataType::F16), framework::dataset::make("DataType", DataType::F16)),
                                                    framework::dataset::make("DataType", DataType::F16));
@@ -61,7 +63,7 @@
 const auto ArithmeticAdditionQASYMM8Dataset = combine(combine(framework::dataset::make("DataType", DataType::QASYMM8), framework::dataset::make("DataType", DataType::QASYMM8)),
                                                       framework::dataset::make("DataType", DataType::QASYMM8));
 const auto ArithmeticAdditionQASYMM8SIGNEDDataset = combine(combine(framework::dataset::make("DataType", DataType::QASYMM8_SIGNED), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                      framework::dataset::make("DataType", DataType::QASYMM8_SIGNED));
+                                                            framework::dataset::make("DataType", DataType::QASYMM8_SIGNED));
 const auto ArithmeticAdditionQSYMM16Dataset = combine(combine(framework::dataset::make("DataType", DataType::QSYMM16), framework::dataset::make("DataType", DataType::QSYMM16)),
                                                       framework::dataset::make("DataType", DataType::QSYMM16));
 } // namespace
@@ -105,6 +107,22 @@
 // clang-format on
 // *INDENT-ON*
 
+TEST_CASE(NoPaddingAdded, framework::DatasetMode::PRECOMMIT)
+{
+    // NEArithmeticAddition doesn't use padding, so make sure this is the case.
+    Tensor input1 = create_tensor<Tensor>(TensorShape(15U, 15U), DataType::F32);
+    Tensor input2 = create_tensor<Tensor>(TensorShape(15U, 1U), DataType::F32);
+    Tensor output = create_tensor<Tensor>(TensorShape(15U, 15U), DataType::F32);
+
+    NEArithmeticAddition add;
+    add.configure(&input1, &input2, &output, ConvertPolicy::WRAP);
+
+    // Validate padding is zero
+    validate(input1.info()->padding(), PaddingSize());
+    validate(input2.info()->padding(), PaddingSize());
+    validate(output.info()->padding(), PaddingSize());
+}
+
 TEST_SUITE(Integer)
 TEST_SUITE(U8)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ArithmeticAdditionU8Dataset),
@@ -130,6 +148,15 @@
     validate(Accessor(_target), _reference);
 }
 TEST_SUITE_END() // S16
+
+TEST_SUITE(S32)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionFixture<int32_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ArithmeticAdditionS32Dataset),
+                                                                                                            framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // S32
 TEST_SUITE_END() // Integer
 
 TEST_SUITE(Float)
@@ -183,6 +210,9 @@
 template <typename T>
 using NEArithmeticAdditionQuantizedFixture = ArithmeticAdditionValidationQuantizedFixture<Tensor, Accessor, NEArithmeticAddition, T>;
 
+template <typename T>
+using NEArithmeticAdditionQuantizedBroadcastFixture = ArithmeticAdditionValidationQuantizedBroadcastFixture<Tensor, Accessor, NEArithmeticAddition, T>;
+
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall,
@@ -220,6 +250,21 @@
     validate(Accessor(_target), _reference, tolerance_quant);
 #endif //__aarch64__
 }
+
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticAdditionQuantizedBroadcastFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(
+                           datasets::SmallShapesBroadcast(), ArithmeticAdditionQASYMM8SIGNEDDataset),
+                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
+                       framework::dataset::make("Src0QInfo", { QuantizationInfo(0.5f, 20) })),
+                       framework::dataset::make("Src1QInfo", { QuantizationInfo(0.5f, 10) })),
+                       framework::dataset::make("OutQInfo", { QuantizationInfo(0.5f, 5) })))
+{
+    // Validate output
+#ifdef __aarch64__
+    validate(Accessor(_target), _reference);
+#else  //__aarch64__
+    validate(Accessor(_target), _reference, tolerance_quant);
+#endif //__aarch64__
+}
 TEST_SUITE_END() // QASYMM8_SIGNED
 
 TEST_SUITE(QSYMM16)
diff --git a/tests/validation/NEON/ArithmeticSubtraction.cpp b/tests/validation/NEON/ArithmeticSubtraction.cpp
index e5c2c5f..f468f6d 100644
--- a/tests/validation/NEON/ArithmeticSubtraction.cpp
+++ b/tests/validation/NEON/ArithmeticSubtraction.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -88,6 +88,8 @@
 const auto ArithmeticSubtractionQuantizationInfoSymmetric = combine(combine(framework::dataset::make("QuantizationInfoIn1", { QuantizationInfo(0.3f, 0) }),
                                                                             framework::dataset::make("QuantizationInfoIn2", { QuantizationInfo(0.7f, 0) })),
                                                                     framework::dataset::make("QuantizationInfoOut", { QuantizationInfo(0.2f, 0) }));
+const auto InPlaceDataSet    = framework::dataset::make("InPlace", { false, true });
+const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false });
 } // namespace
 
 TEST_SUITE(NEON)
@@ -101,7 +103,6 @@
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
         framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                  TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                 TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8),      // Window shrink
                                                  TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),      // Invalid data type combination
                                                  TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),     // Mismatching shapes
                                                  TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::QASYMM8), // Mismatching types
@@ -109,7 +110,6 @@
         }),
         framework::dataset::make("Input2Info",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8),
                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                                 TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
@@ -117,19 +117,17 @@
         })),
         framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::S16),
                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
-                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8),
                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8),
                                                 TensorInfo(TensorShape(48U, 11U, 2U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::QASYMM8),
         })),
         framework::dataset::make("ConvertPolicy",{ ConvertPolicy::WRAP,
                                                 ConvertPolicy::SATURATE,
-                                                ConvertPolicy::WRAP,
                                                 ConvertPolicy::SATURATE,
                                                 ConvertPolicy::WRAP,
                                                 ConvertPolicy::WRAP,
         })),
-        framework::dataset::make("Expected", { true, true, false, false, false, false, false})),
+        framework::dataset::make("Expected", { true, true, false, false, false, false})),
         input1_info, input2_info, output_info, policy, expected)
 {
     ARM_COMPUTE_EXPECT(bool(NEArithmeticSubtraction::validate(&input1_info.clone()->set_is_resizable(false), &input2_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), policy)) == expected, framework::LogLevel::ERRORS);
@@ -137,24 +135,79 @@
 // clang-format on
 // *INDENT-ON*
 
+TEST_SUITE(InPlaceValidate)
+TEST_CASE(SingleTensor, framework::DatasetMode::ALL)
+{
+    const auto random_shape       = TensorShape{ 9, 9 };
+    const auto single_tensor_info = TensorInfo{ random_shape, 1, DataType::F32 };
+
+    Status result = NEArithmeticSubtraction::validate(&single_tensor_info, &single_tensor_info, &single_tensor_info, ConvertPolicy::WRAP);
+    ARM_COMPUTE_EXPECT(bool(result) == true, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(ValidBroadCast, framework::DatasetMode::ALL)
+{
+    const auto larger_shape  = TensorShape{ 27U, 13U, 2U };
+    const auto smaller_shape = TensorShape{ 1U, 13U, 2U };
+
+    const auto larger_tensor_info  = TensorInfo{ larger_shape, 1, DataType::F32 };
+    const auto smaller_tensor_info = TensorInfo{ smaller_shape, 1, DataType::F32 };
+
+    Status result = NEArithmeticSubtraction::validate(&larger_tensor_info, &smaller_tensor_info, &larger_tensor_info, ConvertPolicy::WRAP);
+    ARM_COMPUTE_EXPECT(bool(result) == true, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(InvalidBroadcastOutput, framework::DatasetMode::ALL)
+{
+    const auto larger_shape  = TensorShape{ 27U, 13U, 2U };
+    const auto smaller_shape = TensorShape{ 1U, 13U, 2U };
+
+    const auto larger_tensor_info  = TensorInfo{ larger_shape, 1, DataType::F32 };
+    const auto smaller_tensor_info = TensorInfo{ smaller_shape, 1, DataType::F32 };
+
+    Status result = NEArithmeticSubtraction::validate(&larger_tensor_info, &smaller_tensor_info, &smaller_tensor_info, ConvertPolicy::WRAP);
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(InvalidBroadcastBoth, framework::DatasetMode::ALL)
+{
+    const auto shape0 = TensorShape{ 9U, 9U };
+    const auto shape1 = TensorShape{ 9U, 1U, 2U };
+
+    const auto info0 = TensorInfo{ shape0, 1, DataType::F32 };
+    const auto info1 = TensorInfo{ shape1, 1, DataType::F32 };
+
+    Status result{};
+
+    result = NEArithmeticSubtraction::validate(&info0, &info1, &info0, ConvertPolicy::WRAP);
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
+
+    result = NEArithmeticSubtraction::validate(&info0, &info1, &info1, ConvertPolicy::WRAP);
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
+}
+TEST_SUITE_END() // InPlaceValidate
+
 TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ArithmeticSubtractionU8Dataset),
-                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionU8Dataset),
+                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                     OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 TEST_SUITE_END() // U8
 
-using NEArithmeticSubtractionQASYMM8Fixture       = ArithmeticSubtractionValidationQuantizedFixture<Tensor, Accessor, NEArithmeticSubtraction, uint8_t>;
-using NEArithmeticSubtractionQASYMM8SignedFixture = ArithmeticSubtractionValidationQuantizedFixture<Tensor, Accessor, NEArithmeticSubtraction, int8_t>;
-using NEArithmeticSubtractionQSYMM16Fixture       = ArithmeticSubtractionValidationQuantizedFixture<Tensor, Accessor, NEArithmeticSubtraction, int16_t>;
+using NEArithmeticSubtractionQASYMM8Fixture                = ArithmeticSubtractionValidationQuantizedFixture<Tensor, Accessor, NEArithmeticSubtraction, uint8_t>;
+using NEArithmeticSubtractionQASYMM8SignedFixture          = ArithmeticSubtractionValidationQuantizedFixture<Tensor, Accessor, NEArithmeticSubtraction, int8_t>;
+using NEArithmeticSubtractionQASYMM8SignedBroadcastFixture = ArithmeticSubtractionValidationQuantizedBroadcastFixture<Tensor, Accessor, NEArithmeticSubtraction, int8_t>;
+using NEArithmeticSubtractionQSYMM16Fixture                = ArithmeticSubtractionValidationQuantizedFixture<Tensor, Accessor, NEArithmeticSubtraction, int16_t>;
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQASYMM8Fixture, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionQASYMM8Dataset),
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQASYMM8Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionQASYMM8Dataset),
                                                                                                                      framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
-                                                                                                             ArithmeticSubtractionQuantizationInfoDataset))
+                                                                                                                     ArithmeticSubtractionQuantizationInfoDataset),
+                                                                                                             InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -162,11 +215,23 @@
 TEST_SUITE_END() // QASYMM8
 
 TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQASYMM8SignedFixture, framework::DatasetMode::ALL, combine(combine(combine(
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQASYMM8SignedFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(
                                                                                                                        datasets::SmallShapes(),
                                                                                                                        ArithmeticSubtractionQASYMM8SIGNEDDataset),
                                                                                                                    framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
-                                                                                                                   ArithmeticSubtractionQuantizationInfoSignedDataset))
+                                                                                                                   ArithmeticSubtractionQuantizationInfoSignedDataset),
+                                                                                                                   InPlaceDataSet))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionQASYMM8SignedBroadcastFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(
+                           datasets::SmallShapesBroadcast(),
+                           ArithmeticSubtractionQASYMM8SIGNEDDataset),
+                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
+                       ArithmeticSubtractionQuantizationInfoSignedDataset),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -174,11 +239,12 @@
 TEST_SUITE_END() // QASYMM8_SIGNED
 
 TEST_SUITE(QSYMM16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQSYMM16Fixture, framework::DatasetMode::ALL, combine(combine(combine(
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQSYMM16Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine(
         datasets::SmallShapes(),
         ArithmeticSubtractionQSYMM16Dataset),
                                                                                                                      framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
-                                                                                                             ArithmeticSubtractionQuantizationInfoSymmetric))
+                                                                                                                     ArithmeticSubtractionQuantizationInfoSymmetric),
+                                                                                                             OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qsymm16);
@@ -187,15 +253,17 @@
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ArithmeticSubtractionS16Dataset),
-                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionS16Dataset),
+                                                                                                                     framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                     OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), ArithmeticSubtractionS16Dataset),
-                                                                                                                   framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), ArithmeticSubtractionS16Dataset),
+                                                                                                                   framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                   OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -205,8 +273,9 @@
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(F16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<half>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(), ArithmeticSubtractionFP16Dataset),
-                                                                                                            framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionFP16Dataset),
+                                                                                                                    framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                            OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -215,15 +284,17 @@
 #endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 TEST_SUITE(F32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), ArithmeticSubtractionFP32Dataset),
-                                                                                                                   framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), ArithmeticSubtractionFP32Dataset),
+                                                                                                                   framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                   InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(), ArithmeticSubtractionFP32Dataset),
-                                                                                                                 framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), ArithmeticSubtractionFP32Dataset),
+                                                                                                                 framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                                                                                                                 OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -232,17 +303,19 @@
 template <typename T>
 using NEArithmeticSubtractionBroadcastFixture = ArithmeticSubtractionBroadcastValidationFixture<Tensor, Accessor, NEArithmeticSubtraction, T>;
 
-FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionBroadcastFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapesBroadcast(),
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionBroadcastFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapesBroadcast(),
                        ArithmeticSubtractionFP32Dataset),
-                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, NEArithmeticSubtractionBroadcastFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapesBroadcast(),
+FIXTURE_DATA_TEST_CASE(RunLargeBroadcast, NEArithmeticSubtractionBroadcastFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapesBroadcast(),
                        ArithmeticSubtractionFP32Dataset),
-                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })))
+                       framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })),
+                       OutOfPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference);
diff --git a/tests/validation/NEON/BatchConcatenateLayer.cpp b/tests/validation/NEON/BatchConcatenateLayer.cpp
index 2c0ebc3..6eafe82 100644
--- a/tests/validation/NEON/BatchConcatenateLayer.cpp
+++ b/tests/validation/NEON/BatchConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -70,7 +70,7 @@
     inputs_vector_info.emplace_back(std::move(input_info1));
     inputs_vector_info.emplace_back(std::move(input_info2));
 
-    std::vector<ITensorInfo *> inputs_vector_info_raw;
+    std::vector<const ITensorInfo *> inputs_vector_info_raw;
     inputs_vector_info_raw.reserve(inputs_vector_info.size());
     for(auto &input : inputs_vector_info)
     {
diff --git a/tests/validation/NEON/BatchNormalizationLayer.cpp b/tests/validation/NEON/BatchNormalizationLayer.cpp
index 58b7474..067c5bb 100644
--- a/tests/validation/NEON/BatchNormalizationLayer.cpp
+++ b/tests/validation/NEON/BatchNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -71,69 +71,34 @@
 template <typename T>
 using NEBatchNormalizationLayerFixture = BatchNormalizationLayerValidationFixture<Tensor, Accessor, NEBatchNormalizationLayer, T>;
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallRandomBatchNormalizationLayerDataset(),
-                                                                                   combine(framework::dataset::make("UseBeta", { false, true }), framework::dataset::make("UseGamma", { false, true }))),
-                                                                           framework::dataset::make("DataType", { DataType::F32 })),
-                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-               shape0, shape1, epsilon, use_beta, use_gamma, dt, data_layout)
-{
-    TensorShape src_dst_shapes = shape0;
-    if(data_layout == DataLayout::NHWC)
-    {
-        permute(src_dst_shapes, PermutationVector(2U, 0U, 1U));
-    }
-
-    // Create tensors
-    Tensor src   = create_tensor<Tensor>(src_dst_shapes, dt, 1, QuantizationInfo(), data_layout);
-    Tensor dst   = create_tensor<Tensor>(src_dst_shapes, dt, 1, QuantizationInfo(), data_layout);
-    Tensor mean  = create_tensor<Tensor>(shape1, dt, 1);
-    Tensor var   = create_tensor<Tensor>(shape1, dt, 1);
-    Tensor beta  = create_tensor<Tensor>(shape1, dt, 1);
-    Tensor gamma = create_tensor<Tensor>(shape1, dt, 1);
-
-    // Create and Configure function
-    NEBatchNormalizationLayer norm;
-    Tensor                   *beta_ptr  = use_beta ? &beta : nullptr;
-    Tensor                   *gamma_ptr = use_gamma ? &gamma : nullptr;
-    norm.configure(&src, &dst, &mean, &var, beta_ptr, gamma_ptr, epsilon);
-
-    // Validate valid region
-    const ValidRegion valid_region = shape_to_valid_region(src_dst_shapes);
-    validate(dst.info()->valid_region(), valid_region);
-}
-
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
-               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),    // Window shrink
+               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),    // Mismatching data types
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),    // Mismatching data types
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),    // Invalid mean/var/beta/gamma shape
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),    // Fused activation's a < b
                                                      }),
-               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
+               framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F16),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                                      })),
                framework::dataset::make("MVBGInfo",{ TensorInfo(TensorShape(2U), 1, DataType::F32),
-                                                     TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                      TensorInfo(TensorShape(2U), 1, DataType::F16),
                                                      TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                      TensorInfo(TensorShape(5U), 1, DataType::F32),
                                                      TensorInfo(TensorShape(2U), 1, DataType::F32),
                                                    })),
                framework::dataset::make("ActivationLayerInfo",{ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
-                                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
                                                      ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f),
                                                      ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f),
                                                      ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f),
                                                      ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 2.f, 6.f),
                                                    })),
-               framework::dataset::make("Expected", { true, false, false, false, false, false})),
+               framework::dataset::make("Expected", { true, false, false, false, false})),
                input_info, output_info, mvbg_info, act_info, expected)
 {
     const auto &mean_info = mvbg_info;
diff --git a/tests/validation/NEON/BatchToSpaceLayer.cpp b/tests/validation/NEON/BatchToSpaceLayer.cpp
index b5d684a..ff78ce8 100644
--- a/tests/validation/NEON/BatchToSpaceLayer.cpp
+++ b/tests/validation/NEON/BatchToSpaceLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/BitwiseAnd.cpp b/tests/validation/NEON/BitwiseAnd.cpp
index 773c985..f10be8d 100644
--- a/tests/validation/NEON/BitwiseAnd.cpp
+++ b/tests/validation/NEON/BitwiseAnd.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/BitwiseNot.cpp b/tests/validation/NEON/BitwiseNot.cpp
index abc091c..a53e77d 100644
--- a/tests/validation/NEON/BitwiseNot.cpp
+++ b/tests/validation/NEON/BitwiseNot.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/BitwiseOr.cpp b/tests/validation/NEON/BitwiseOr.cpp
index 1a33443..f74594a 100644
--- a/tests/validation/NEON/BitwiseOr.cpp
+++ b/tests/validation/NEON/BitwiseOr.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/BitwiseXor.cpp b/tests/validation/NEON/BitwiseXor.cpp
index 2a8c0c9..094a69b 100644
--- a/tests/validation/NEON/BitwiseXor.cpp
+++ b/tests/validation/NEON/BitwiseXor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/BoundingBoxTransform.cpp b/tests/validation/NEON/BoundingBoxTransform.cpp
index 3f16b45..c662c41 100644
--- a/tests/validation/NEON/BoundingBoxTransform.cpp
+++ b/tests/validation/NEON/BoundingBoxTransform.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Box3x3.cpp b/tests/validation/NEON/Box3x3.cpp
index 21bce05..b4fd06a 100644
--- a/tests/validation/NEON/Box3x3.cpp
+++ b/tests/validation/NEON/Box3x3.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/CannyEdge.cpp b/tests/validation/NEON/CannyEdge.cpp
index 7c4cd80..42222c0 100644
--- a/tests/validation/NEON/CannyEdge.cpp
+++ b/tests/validation/NEON/CannyEdge.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Cast.cpp b/tests/validation/NEON/Cast.cpp
index fd66be3..c4c350a 100644
--- a/tests/validation/NEON/Cast.cpp
+++ b/tests/validation/NEON/Cast.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/ChannelCombine.cpp b/tests/validation/NEON/ChannelCombine.cpp
index fcba07f..1a400f2 100644
--- a/tests/validation/NEON/ChannelCombine.cpp
+++ b/tests/validation/NEON/ChannelCombine.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/ChannelExtract.cpp b/tests/validation/NEON/ChannelExtract.cpp
index 542cd1a..db7a9cf 100644
--- a/tests/validation/NEON/ChannelExtract.cpp
+++ b/tests/validation/NEON/ChannelExtract.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/ChannelShuffle.cpp b/tests/validation/NEON/ChannelShuffle.cpp
index ec84183..d7b98d9 100644
--- a/tests/validation/NEON/ChannelShuffle.cpp
+++ b/tests/validation/NEON/ChannelShuffle.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Col2Im.cpp b/tests/validation/NEON/Col2Im.cpp
index e4a52f2..5f98b45 100644
--- a/tests/validation/NEON/Col2Im.cpp
+++ b/tests/validation/NEON/Col2Im.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/ColorConvert.cpp b/tests/validation/NEON/ColorConvert.cpp
index c776479..be64ee0 100644
--- a/tests/validation/NEON/ColorConvert.cpp
+++ b/tests/validation/NEON/ColorConvert.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Comparisons.cpp b/tests/validation/NEON/Comparisons.cpp
index f080c83..8dc78d8 100644
--- a/tests/validation/NEON/Comparisons.cpp
+++ b/tests/validation/NEON/Comparisons.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/ConvertFullyConnectedWeights.cpp b/tests/validation/NEON/ConvertFullyConnectedWeights.cpp
index 9050984..e0fefe3 100644
--- a/tests/validation/NEON/ConvertFullyConnectedWeights.cpp
+++ b/tests/validation/NEON/ConvertFullyConnectedWeights.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Convolution.cpp b/tests/validation/NEON/Convolution.cpp
index b942ddc..96e07dd 100644
--- a/tests/validation/NEON/Convolution.cpp
+++ b/tests/validation/NEON/Convolution.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp
index 19f69d1..80615c5 100644
--- a/tests/validation/NEON/ConvolutionLayer.cpp
+++ b/tests/validation/NEON/ConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -464,6 +464,18 @@
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
+FIXTURE_DATA_TEST_CASE(RunSmallSigned, NEGEMMConvolutionLayerQuantizedPerChannelFixture<int8_t>, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                                                       framework::dataset::make("ReshapeWeights", { true })),
+                                                               framework::dataset::make("DataType", { DataType::QASYMM8_SIGNED })),
+                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                               QuantizationData),
+                                       QuantizedActivationFunctionsDataset),
+                               framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
 TEST_SUITE_END() // QSYMM8_PER_CHANNEL
 TEST_SUITE_END() // Quantized
 
diff --git a/tests/validation/NEON/Copy.cpp b/tests/validation/NEON/Copy.cpp
index 350fd00..7bcb008 100644
--- a/tests/validation/NEON/Copy.cpp
+++ b/tests/validation/NEON/Copy.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/CropResize.cpp b/tests/validation/NEON/CropResize.cpp
index 1feed3d..298d393 100644
--- a/tests/validation/NEON/CropResize.cpp
+++ b/tests/validation/NEON/CropResize.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -52,7 +52,6 @@
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
                framework::dataset::make("InputInfo", { TensorInfo(TensorShape(15U, 30U, 40U, 10U), 1, DataType::S32),
-                                                       TensorInfo(TensorShape(15U, 30U, 40U, 10U), 1, DataType::U8),  // Invalid input data type.
                                                        TensorInfo(TensorShape(15U, 30U, 40U, 10U), 1, DataType::S32), // Invalid box_ind shape.
                                                        TensorInfo(TensorShape(15U, 30U, 40U, 10U), 1, DataType::S32), // Invalid output shape.
                                                        TensorInfo(TensorShape(15U, 30U, 40U, 10U), 1, DataType::S32), // Invalid output data type.
@@ -64,11 +63,9 @@
                                                        TensorInfo(TensorShape(4, 20), 1, DataType::F32),
                                                        TensorInfo(TensorShape(4, 20), 1, DataType::F32),
                                                        TensorInfo(TensorShape(4, 20), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(4, 20), 1, DataType::F32),
                                                        TensorInfo(TensorShape(3, 20), 1, DataType::F32),
                                                      })),
                framework::dataset::make("BoxIndInfo",{ TensorInfo(TensorShape(20), 1, DataType::S32),
-                                                       TensorInfo(TensorShape(20), 1, DataType::S32),
                                                        TensorInfo(TensorShape(10), 1, DataType::S32),
                                                        TensorInfo(TensorShape(20), 1, DataType::S32),
                                                        TensorInfo(TensorShape(20), 1, DataType::S32),
@@ -77,13 +74,12 @@
                                                      })),
                framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(15U, 5, 5, 20U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(15U, 5, 5, 20U), 1, DataType::F32),
-                                                       TensorInfo(TensorShape(15U, 5, 5, 20U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(15U, 5, 5, 10U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(15U, 5, 5, 20U), 1, DataType::S32),
                                                        TensorInfo(TensorShape(5U, 5, 5, 20U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(15U, 5, 5, 20U), 1, DataType::F32),
                                                      })),
-               framework::dataset::make("Expected", { true, false, false, false, false, false, false})),
+               framework::dataset::make("Expected", { true, false, false, false, false, false})),
                input, boxes, box_ind, output, expected)
 {
     ARM_COMPUTE_EXPECT(bool(NECropResize::validate(&input.clone()->set_data_layout(DataLayout::NHWC).set_is_resizable(false),
@@ -100,7 +96,7 @@
 TEST_SUITE(F16)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NECropResizeFixture<half>,
-                       framework::DatasetMode::PRECOMMIT,
+                       framework::DatasetMode::ALL,
                        combine(datasets::SmallCropResizeDataset(),
                                combine(framework::dataset::make("IsOutOfBounds", { true, false }),
                                        framework::dataset::make("DataType", DataType::F16))))
@@ -114,7 +110,7 @@
 TEST_SUITE(F32)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NECropResizeFixture<float>,
-                       framework::DatasetMode::PRECOMMIT,
+                       framework::DatasetMode::ALL,
                        combine(datasets::SmallCropResizeDataset(),
                                combine(framework::dataset::make("IsOutOfBounds", { true, false }),
                                        framework::dataset::make("DataType", DataType::F32))))
@@ -125,10 +121,23 @@
 TEST_SUITE_END() // F32
 TEST_SUITE_END() // Float
 
+TEST_SUITE(U8)
+FIXTURE_DATA_TEST_CASE(RunSmall,
+                       NECropResizeFixture<uint8_t>,
+                       framework::DatasetMode::ALL,
+                       combine(datasets::SmallCropResizeDataset(),
+                               combine(framework::dataset::make("IsOutOfBounds", { true, false }),
+                                       framework::dataset::make("DataType", DataType::U8))))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_fp32, 0.01);
+}
+TEST_SUITE_END() // U8
+
 TEST_SUITE(U16)
 FIXTURE_DATA_TEST_CASE(RunSmall,
                        NECropResizeFixture<uint16_t>,
-                       framework::DatasetMode::PRECOMMIT,
+                       framework::DatasetMode::ALL,
                        combine(datasets::SmallCropResizeDataset(),
                                combine(framework::dataset::make("IsOutOfBounds", { true, false }),
                                        framework::dataset::make("DataType", DataType::U16))))
diff --git a/tests/validation/NEON/DeconvolutionLayer.cpp b/tests/validation/NEON/DeconvolutionLayer.cpp
index d888d7b..734e24f 100644
--- a/tests/validation/NEON/DeconvolutionLayer.cpp
+++ b/tests/validation/NEON/DeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/DepthConcatenateLayer.cpp b/tests/validation/NEON/DepthConcatenateLayer.cpp
index eea7c4d..1c69d44 100644
--- a/tests/validation/NEON/DepthConcatenateLayer.cpp
+++ b/tests/validation/NEON/DepthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -67,7 +67,7 @@
     inputs_vector_info.emplace_back(std::move(input_info1));
     inputs_vector_info.emplace_back(std::move(input_info2));
 
-    std::vector<ITensorInfo *> inputs_vector_info_raw;
+    std::vector<const ITensorInfo *> inputs_vector_info_raw;
     inputs_vector_info_raw.reserve(inputs_vector_info.size());
     for(auto &input : inputs_vector_info)
     {
diff --git a/tests/validation/NEON/DepthConvertLayer.cpp b/tests/validation/NEON/DepthConvertLayer.cpp
index 7af467b..8b581a2 100644
--- a/tests/validation/NEON/DepthConvertLayer.cpp
+++ b/tests/validation/NEON/DepthConvertLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/DepthToSpaceLayer.cpp b/tests/validation/NEON/DepthToSpaceLayer.cpp
index abc8c4f..d93437e 100644
--- a/tests/validation/NEON/DepthToSpaceLayer.cpp
+++ b/tests/validation/NEON/DepthToSpaceLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
index e4a136e..407ebe3 100644
--- a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp b/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp
index 64f6a93..0e5024f 100644
--- a/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp
+++ b/tests/validation/NEON/DepthwiseConvolutionLayerNative.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/DequantizationLayer.cpp b/tests/validation/NEON/DequantizationLayer.cpp
index 4389419..f4defcd 100644
--- a/tests/validation/NEON/DequantizationLayer.cpp
+++ b/tests/validation/NEON/DequantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Derivative.cpp b/tests/validation/NEON/Derivative.cpp
index bc778de..0a04778 100644
--- a/tests/validation/NEON/Derivative.cpp
+++ b/tests/validation/NEON/Derivative.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/DetectionPostProcessLayer.cpp b/tests/validation/NEON/DetectionPostProcessLayer.cpp
index 4413ed4..80b1856 100644
--- a/tests/validation/NEON/DetectionPostProcessLayer.cpp
+++ b/tests/validation/NEON/DetectionPostProcessLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Dilate.cpp b/tests/validation/NEON/Dilate.cpp
index b75848b..668d2b0 100644
--- a/tests/validation/NEON/Dilate.cpp
+++ b/tests/validation/NEON/Dilate.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/DilatedConvolutionLayer.cpp b/tests/validation/NEON/DilatedConvolutionLayer.cpp
index 97afa24..4c1e532 100644
--- a/tests/validation/NEON/DilatedConvolutionLayer.cpp
+++ b/tests/validation/NEON/DilatedConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/DirectConvolutionLayer.cpp b/tests/validation/NEON/DirectConvolutionLayer.cpp
index 05bfbc1..88578ca 100644
--- a/tests/validation/NEON/DirectConvolutionLayer.cpp
+++ b/tests/validation/NEON/DirectConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -98,6 +98,25 @@
 const auto data_precommit    = combine(data, framework::dataset::make("NumKernels", { 1 }));
 const auto data_precommit9x9 = combine(data9x9, framework::dataset::make("NumKernels", { 4 }));
 
+/* The following test is from a real use-case that made DirectConvolution
+ * overflow in terms of its tensor indexing. This test case uses
+ * a separate tolerance for the following reasons.
+ * - It has been shown that it generally requires a larger absolute tolerance
+ *   for large numbers or a larger relative tolerance for small numbers.
+ * - Given the first reason, since the test mainly targets index overflow,
+ *   a value with a margin is used to avoid unintended test failures
+ *   during nightly runs.
+ */
+constexpr AbsoluteTolerance<float> usecase_tolerance_fp32(0.05f);
+
+const auto data_nightly_usecase = combine(framework::dataset::make("InputShape", { TensorShape{ 3U, 800U, 800U } }),
+                                          combine(framework::dataset::make("StrideX", { 1 }),
+                                                  combine(framework::dataset::make("StrideY", { 1 }),
+                                                          combine(framework::dataset::make("PadX", { 4 }),
+                                                                  combine(framework::dataset::make("PadY", { 4 }),
+                                                                          combine(framework::dataset::make("KernelSize", 9),
+                                                                                  framework::dataset::make("NumKernels", { 16 })))))));
+
 /** Activation function Dataset*/
 const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
 {
@@ -227,6 +246,14 @@
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32);
 }
+FIXTURE_DATA_TEST_CASE(RunLargeUsecase, NEDirectConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(data_nightly_usecase, framework::dataset::make("DataType",
+                       DataType::F32)),
+                       framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })),
+                       framework::dataset::make("DataLayout", { DataLayout::NHWC })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, usecase_tolerance_fp32);
+}
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 TEST_SUITE_END() // DirectConvolutionLayer
diff --git a/tests/validation/NEON/ElementwiseAbsoluteValue.cpp b/tests/validation/NEON/ElementwiseAbsoluteValue.cpp
index 3aee46f..000a6de 100644
--- a/tests/validation/NEON/ElementwiseAbsoluteValue.cpp
+++ b/tests/validation/NEON/ElementwiseAbsoluteValue.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/ElementwiseDivision.cpp b/tests/validation/NEON/ElementwiseDivision.cpp
index f5e1f86..f6e0a65 100644
--- a/tests/validation/NEON/ElementwiseDivision.cpp
+++ b/tests/validation/NEON/ElementwiseDivision.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/ElementwiseExpLayer.cpp b/tests/validation/NEON/ElementwiseExpLayer.cpp
index 37a8bb7..5b6f33e 100644
--- a/tests/validation/NEON/ElementwiseExpLayer.cpp
+++ b/tests/validation/NEON/ElementwiseExpLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/ElementwiseLog.cpp b/tests/validation/NEON/ElementwiseLog.cpp
index 870c12e..4c5a35d 100644
--- a/tests/validation/NEON/ElementwiseLog.cpp
+++ b/tests/validation/NEON/ElementwiseLog.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/ElementwiseMax.cpp b/tests/validation/NEON/ElementwiseMax.cpp
index bd61ba5..449d5db 100644
--- a/tests/validation/NEON/ElementwiseMax.cpp
+++ b/tests/validation/NEON/ElementwiseMax.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/ElementwiseMin.cpp b/tests/validation/NEON/ElementwiseMin.cpp
index 0fc6f5f..6678a1b 100644
--- a/tests/validation/NEON/ElementwiseMin.cpp
+++ b/tests/validation/NEON/ElementwiseMin.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/ElementwiseNegation.cpp b/tests/validation/NEON/ElementwiseNegation.cpp
index 7e7c838..e121b13 100644
--- a/tests/validation/NEON/ElementwiseNegation.cpp
+++ b/tests/validation/NEON/ElementwiseNegation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -70,19 +70,21 @@
 }
 
 template <typename T>
-using NENegLayerFixture = NegValidationFixture<Tensor, Accessor, NENegLayer, T>;
+using NENegLayerFixture = NegValidationInPlaceFixture<Tensor, Accessor, NENegLayer, T>;
 
 TEST_SUITE(Float)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NENegLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                     DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunSmall, NENegLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(),
+                                                                                                             framework::dataset::make("DataType", DataType::F16)),
+                                                                                                     framework::dataset::make("InPlace", { true, false })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NENegLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                   DataType::F16)))
+FIXTURE_DATA_TEST_CASE(RunLarge, NENegLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(),
+                                                                                                           framework::dataset::make("DataType", DataType::F16)),
+                                                                                                   framework::dataset::make("InPlace", { false })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp16);
@@ -92,15 +94,17 @@
 #endif           // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NENegLayerFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunSmall, NENegLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(),
+                                                                                                        framework::dataset::make("DataType", DataType::F32)),
+                                                                                                framework::dataset::make("InPlace", { true, false })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NENegLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                    DataType::F32)))
+FIXTURE_DATA_TEST_CASE(RunLarge, NENegLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(),
+                                                                                                            framework::dataset::make("DataType", DataType::F32)),
+                                                                                                    framework::dataset::make("InPlace", { false })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_fp32);
@@ -110,15 +114,17 @@
 
 TEST_SUITE(Integer)
 TEST_SUITE(S32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NENegLayerFixture<int32_t>, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                  DataType::S32)))
+FIXTURE_DATA_TEST_CASE(RunSmall, NENegLayerFixture<int32_t>, framework::DatasetMode::ALL, combine(combine(datasets::SmallShapes(),
+                                                                                                          framework::dataset::make("DataType", DataType::S32)),
+                                                                                                  framework::dataset::make("InPlace", { true, false })))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NENegLayerFixture<int32_t>, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                      DataType::S32)))
+FIXTURE_DATA_TEST_CASE(RunLarge, NENegLayerFixture<int32_t>, framework::DatasetMode::NIGHTLY, combine(combine(datasets::LargeShapes(),
+                                                                                                              framework::dataset::make("DataType", DataType::S32)),
+                                                                                                      framework::dataset::make("InPlace", { false })))
 {
     // Validate output
     validate(Accessor(_target), _reference);
diff --git a/tests/validation/NEON/ElementwisePower.cpp b/tests/validation/NEON/ElementwisePower.cpp
index 3ca39e8..bdca861 100644
--- a/tests/validation/NEON/ElementwisePower.cpp
+++ b/tests/validation/NEON/ElementwisePower.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/ElementwiseRound.cpp b/tests/validation/NEON/ElementwiseRound.cpp
index 1d8cff6..fc19434 100644
--- a/tests/validation/NEON/ElementwiseRound.cpp
+++ b/tests/validation/NEON/ElementwiseRound.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/ElementwiseRsqrtLayer.cpp b/tests/validation/NEON/ElementwiseRsqrtLayer.cpp
index 0edc06a..58efa28 100644
--- a/tests/validation/NEON/ElementwiseRsqrtLayer.cpp
+++ b/tests/validation/NEON/ElementwiseRsqrtLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/ElementwiseSin.cpp b/tests/validation/NEON/ElementwiseSin.cpp
index c68d1e5..2e93ce3 100644
--- a/tests/validation/NEON/ElementwiseSin.cpp
+++ b/tests/validation/NEON/ElementwiseSin.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/ElementwiseSquareDiff.cpp b/tests/validation/NEON/ElementwiseSquareDiff.cpp
index cf55c6a..e81edf7 100644
--- a/tests/validation/NEON/ElementwiseSquareDiff.cpp
+++ b/tests/validation/NEON/ElementwiseSquareDiff.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/EqualizeHistogram.cpp b/tests/validation/NEON/EqualizeHistogram.cpp
index ad7d710..e1d3986 100644
--- a/tests/validation/NEON/EqualizeHistogram.cpp
+++ b/tests/validation/NEON/EqualizeHistogram.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Erode.cpp b/tests/validation/NEON/Erode.cpp
index 0b87e60..ff9c927 100644
--- a/tests/validation/NEON/Erode.cpp
+++ b/tests/validation/NEON/Erode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/FFT.cpp b/tests/validation/NEON/FFT.cpp
index d762630..7f1c7c5 100644
--- a/tests/validation/NEON/FFT.cpp
+++ b/tests/validation/NEON/FFT.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/FastCorners.cpp b/tests/validation/NEON/FastCorners.cpp
index 4416662..389aa60 100644
--- a/tests/validation/NEON/FastCorners.cpp
+++ b/tests/validation/NEON/FastCorners.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Fill.cpp b/tests/validation/NEON/Fill.cpp
index 59cc2cb..5fe92ca 100644
--- a/tests/validation/NEON/Fill.cpp
+++ b/tests/validation/NEON/Fill.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/FillBorder.cpp b/tests/validation/NEON/FillBorder.cpp
index 7e0fb1a..b567b3f 100644
--- a/tests/validation/NEON/FillBorder.cpp
+++ b/tests/validation/NEON/FillBorder.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Flatten.cpp b/tests/validation/NEON/Flatten.cpp
index d4742a7..daadd56 100644
--- a/tests/validation/NEON/Flatten.cpp
+++ b/tests/validation/NEON/Flatten.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Floor.cpp b/tests/validation/NEON/Floor.cpp
index 978b6db..419ce56 100644
--- a/tests/validation/NEON/Floor.cpp
+++ b/tests/validation/NEON/Floor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/FullyConnectedLayer.cpp b/tests/validation/NEON/FullyConnectedLayer.cpp
index 523b3c6..d8c2203 100644
--- a/tests/validation/NEON/FullyConnectedLayer.cpp
+++ b/tests/validation/NEON/FullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/FuseBatchNormalization.cpp b/tests/validation/NEON/FuseBatchNormalization.cpp
index 1a8f928..9fc3353 100644
--- a/tests/validation/NEON/FuseBatchNormalization.cpp
+++ b/tests/validation/NEON/FuseBatchNormalization.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/GEMM.cpp b/tests/validation/NEON/GEMM.cpp
index 2bcdf8a..f817390 100644
--- a/tests/validation/NEON/GEMM.cpp
+++ b/tests/validation/NEON/GEMM.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp
index c3747dd..579499d 100644
--- a/tests/validation/NEON/GEMMLowp.cpp
+++ b/tests/validation/NEON/GEMMLowp.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Gather.cpp b/tests/validation/NEON/Gather.cpp
index 6c7faa0..af534ba 100644
--- a/tests/validation/NEON/Gather.cpp
+++ b/tests/validation/NEON/Gather.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Gaussian3x3.cpp b/tests/validation/NEON/Gaussian3x3.cpp
index 369e768..7396be7 100644
--- a/tests/validation/NEON/Gaussian3x3.cpp
+++ b/tests/validation/NEON/Gaussian3x3.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Gaussian5x5.cpp b/tests/validation/NEON/Gaussian5x5.cpp
index b748d73..6c4c480 100644
--- a/tests/validation/NEON/Gaussian5x5.cpp
+++ b/tests/validation/NEON/Gaussian5x5.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/GaussianPyramid.cpp b/tests/validation/NEON/GaussianPyramid.cpp
index c646b50..ed8e43c 100644
--- a/tests/validation/NEON/GaussianPyramid.cpp
+++ b/tests/validation/NEON/GaussianPyramid.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/GenerateProposalsLayer.cpp b/tests/validation/NEON/GenerateProposalsLayer.cpp
index dd9c1ef..d14b8cb 100644
--- a/tests/validation/NEON/GenerateProposalsLayer.cpp
+++ b/tests/validation/NEON/GenerateProposalsLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/GlobalPoolingLayer.cpp b/tests/validation/NEON/GlobalPoolingLayer.cpp
index 7697806..9cf6a8b 100644
--- a/tests/validation/NEON/GlobalPoolingLayer.cpp
+++ b/tests/validation/NEON/GlobalPoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/HOGDescriptor.cpp b/tests/validation/NEON/HOGDescriptor.cpp
index 2966367..afaf8d9 100644
--- a/tests/validation/NEON/HOGDescriptor.cpp
+++ b/tests/validation/NEON/HOGDescriptor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/HOGDetector.cpp b/tests/validation/NEON/HOGDetector.cpp
index c787728..2036ceb 100644
--- a/tests/validation/NEON/HOGDetector.cpp
+++ b/tests/validation/NEON/HOGDetector.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/HOGMultiDetection.cpp b/tests/validation/NEON/HOGMultiDetection.cpp
index d6017e0..e15fe24 100644
--- a/tests/validation/NEON/HOGMultiDetection.cpp
+++ b/tests/validation/NEON/HOGMultiDetection.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/HarrisCorners.cpp b/tests/validation/NEON/HarrisCorners.cpp
index 23621cd..e4c0827 100644
--- a/tests/validation/NEON/HarrisCorners.cpp
+++ b/tests/validation/NEON/HarrisCorners.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/HeightConcatenateLayer.cpp b/tests/validation/NEON/HeightConcatenateLayer.cpp
index bfb0a21..c46b797 100644
--- a/tests/validation/NEON/HeightConcatenateLayer.cpp
+++ b/tests/validation/NEON/HeightConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -71,7 +71,7 @@
     inputs_vector_info.emplace_back(std::move(input_info1));
     inputs_vector_info.emplace_back(std::move(input_info2));
 
-    std::vector<ITensorInfo *> inputs_vector_info_raw;
+    std::vector<const ITensorInfo *> inputs_vector_info_raw;
     inputs_vector_info_raw.reserve(inputs_vector_info.size());
     for(auto &input : inputs_vector_info)
     {
diff --git a/tests/validation/NEON/Histogram.cpp b/tests/validation/NEON/Histogram.cpp
index f0beb4e..03b2e2b 100644
--- a/tests/validation/NEON/Histogram.cpp
+++ b/tests/validation/NEON/Histogram.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Im2Col.cpp b/tests/validation/NEON/Im2Col.cpp
index f4b2cc7..d218956 100644
--- a/tests/validation/NEON/Im2Col.cpp
+++ b/tests/validation/NEON/Im2Col.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/InstanceNormalizationLayer.cpp b/tests/validation/NEON/InstanceNormalizationLayer.cpp
index d2a80f2..1073a7f 100644
--- a/tests/validation/NEON/InstanceNormalizationLayer.cpp
+++ b/tests/validation/NEON/InstanceNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,7 +45,11 @@
 /** Tolerance for float operations */
 AbsoluteTolerance<float> tolerance_f32(0.0015f);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-AbsoluteTolerance<float> tolerance_f16(0.5f);
+// This tolerance is chosen based on the precision float16_t can provide
+// for decimal numbers between 16 and 32, and was determined through
+// multiple test executions. However, with randomly generated numbers
+// there is no guarantee that this tolerance will always be large enough.
+AbsoluteTolerance<half> tolerance_f16(static_cast<half>(0.015625f));
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 } // namespace
 
diff --git a/tests/validation/NEON/IntegralImage.cpp b/tests/validation/NEON/IntegralImage.cpp
index 4408acd..2a8aa95 100644
--- a/tests/validation/NEON/IntegralImage.cpp
+++ b/tests/validation/NEON/IntegralImage.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/L2NormalizeLayer.cpp b/tests/validation/NEON/L2NormalizeLayer.cpp
index 17147c1..37146f4 100644
--- a/tests/validation/NEON/L2NormalizeLayer.cpp
+++ b/tests/validation/NEON/L2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -99,7 +99,7 @@
 TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEL2NormalizeLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
                        combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                       framework::dataset::make("Axis", { -1, 0, 2 })),
+                                       framework::dataset::make("Axis", { -1, 0, 1, 2 })),
                                framework::dataset::make("Epsilon", { 1e-12 })))
 {
     // Validate output
@@ -120,7 +120,7 @@
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEL2NormalizeLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
                        combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                       framework::dataset::make("Axis", { -1, 0, 2 })),
+                                       framework::dataset::make("Axis", { -1, 0, 1, 2 })),
                                framework::dataset::make("Epsilon", { 1e-12 })))
 {
     // Validate output
diff --git a/tests/validation/NEON/LSTMLayer.cpp b/tests/validation/NEON/LSTMLayer.cpp
index 45beb36..0850dc6 100644
--- a/tests/validation/NEON/LSTMLayer.cpp
+++ b/tests/validation/NEON/LSTMLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -134,9 +134,10 @@
                input_info, input_weights_info, recurrent_weights_info, cell_bias_info, projection_bias_info, cell_state_info, output_info, scratch_info, info, expected)
 {
     LSTMParams<ITensorInfo> lstm_params_info;
-    lstm_params_info.set_peephole_params(&cell_bias_info, &cell_bias_info)
+    auto cell_bias_clone = cell_bias_info.clone();
+    lstm_params_info.set_peephole_params(cell_bias_clone.get(), cell_bias_clone.get())
                     .set_projection_params(&recurrent_weights_info, &projection_bias_info)
-                    .set_cifg_params(&input_weights_info, &recurrent_weights_info, &cell_bias_info, &cell_bias_info);
+                    .set_cifg_params(&input_weights_info, &recurrent_weights_info, cell_bias_clone.get(), cell_bias_clone.get());
 
     ARM_COMPUTE_EXPECT(bool(NELSTMLayer::validate(&input_info.clone()->set_is_resizable(false), &input_weights_info.clone()->set_is_resizable(false), &input_weights_info.clone()->set_is_resizable(false),
                                                   &input_weights_info.clone()->set_is_resizable(false), &recurrent_weights_info.clone()->set_is_resizable(false), &recurrent_weights_info.clone()->set_is_resizable(false),
diff --git a/tests/validation/NEON/LSTMLayerQuantized.cpp b/tests/validation/NEON/LSTMLayerQuantized.cpp
index b57a8f7..fbcece8 100644
--- a/tests/validation/NEON/LSTMLayerQuantized.cpp
+++ b/tests/validation/NEON/LSTMLayerQuantized.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/LaplacianPyramid.cpp b/tests/validation/NEON/LaplacianPyramid.cpp
index 456ae75..0c03c70 100644
--- a/tests/validation/NEON/LaplacianPyramid.cpp
+++ b/tests/validation/NEON/LaplacianPyramid.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/LaplacianReconstruct.cpp b/tests/validation/NEON/LaplacianReconstruct.cpp
index b41449e..bc1151f 100644
--- a/tests/validation/NEON/LaplacianReconstruct.cpp
+++ b/tests/validation/NEON/LaplacianReconstruct.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/LocallyConnected.cpp b/tests/validation/NEON/LocallyConnected.cpp
index d58ca45..37c7752 100644
--- a/tests/validation/NEON/LocallyConnected.cpp
+++ b/tests/validation/NEON/LocallyConnected.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/LogSoftmaxLayer.cpp b/tests/validation/NEON/LogSoftmaxLayer.cpp
index e35c8fd..3f85e3f 100644
--- a/tests/validation/NEON/LogSoftmaxLayer.cpp
+++ b/tests/validation/NEON/LogSoftmaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -71,23 +71,23 @@
 FIXTURE_DATA_TEST_CASE(RunSmall, NELogSoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
                                                                                                                     framework::dataset::make("DataType", DataType::F16)),
                                                                                                                     framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                            framework::dataset::make("Axis", { 1 })))
+                                                                                                            framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
 }
 FIXTURE_DATA_TEST_CASE(RunSmall4D, NELogSoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
-                                                                                                                      framework::dataset::make("DataType", DataType::F32)),
+                                                                                                                      framework::dataset::make("DataType", DataType::F16)),
                                                                                                                       framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                              framework::dataset::make("Axis", { 1, 2, 3 })))
+                                                                                                              framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
-    validate(Accessor(_target), _reference, tolerance_f32);
+    validate(Accessor(_target), _reference, tolerance_f16);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, NELogSoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
                                                                                                                   framework::dataset::make("DataType", DataType::F16)),
                                                                                                                   framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                          framework::dataset::make("Axis", { 1 })))
+                                                                                                          framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
@@ -99,7 +99,7 @@
 FIXTURE_DATA_TEST_CASE(RunSmall2D, NELogSoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
                                                                                                                        framework::dataset::make("DataType", DataType::F32)),
                                                                                                                        framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                               framework::dataset::make("Axis", { 1 })))
+                                                                                                               framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
@@ -107,7 +107,7 @@
 FIXTURE_DATA_TEST_CASE(RunSmall4D, NELogSoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
                                                                                                                        framework::dataset::make("DataType", DataType::F32)),
                                                                                                                        framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                               framework::dataset::make("Axis", { 1, 2, 3 })))
+                                                                                                               framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
@@ -115,7 +115,7 @@
 FIXTURE_DATA_TEST_CASE(RunLarge, NELogSoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
                                                                                                                    framework::dataset::make("DataType", DataType::F32)),
                                                                                                                    framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                           framework::dataset::make("Axis", { 1 })))
+                                                                                                           framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
@@ -132,7 +132,7 @@
                                                                                                                     framework::dataset::make("DataType", DataType::QASYMM8)),
                                                                                                                     combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
                                                                                                                             framework::dataset::make("Beta", { 1.0f, 2.f }))),
-                                                                                                                    framework::dataset::make("Axis", { 1 })))
+                                                                                                                    framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -141,7 +141,7 @@
                                                                                                                     framework::dataset::make("DataType", DataType::QASYMM8)),
                                                                                                                     combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
                                                                                                                             framework::dataset::make("Beta", { 1.0f, 2.f }))),
-                                                                                                                    framework::dataset::make("Axis", { 1, 2, 3 })))
+                                                                                                                    framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -150,7 +150,7 @@
                                                                                                                       framework::dataset::make("DataType", DataType::QASYMM8)),
                                                                                                                       combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
                                                                                                                               framework::dataset::make("Beta", { 1.0f, 2.0f }))),
-                                                                                                                      framework::dataset::make("Axis", { 1 })))
+                                                                                                                      framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
diff --git a/tests/validation/NEON/Magnitude.cpp b/tests/validation/NEON/Magnitude.cpp
index 61acc90..e14b32a 100644
--- a/tests/validation/NEON/Magnitude.cpp
+++ b/tests/validation/NEON/Magnitude.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/MaxUnpoolingLayer.cpp b/tests/validation/NEON/MaxUnpoolingLayer.cpp
new file mode 100644
index 0000000..a33ec28
--- /dev/null
+++ b/tests/validation/NEON/MaxUnpoolingLayer.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/MaxUnpoolingLayerFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(NEON)
+TEST_SUITE(PoolingLayer)
+
+template <typename T>
+using NEMaxUnpoolingLayerFixture = MaxUnpoolingLayerValidationFixture<Tensor, Accessor, NEPoolingLayer, NEMaxUnpoolingLayer, T>;
+
+const auto PoolingLayerIndicesDatasetFPSmall = combine(combine(framework::dataset::make("PoolType", { PoolingType::MAX }), framework::dataset::make("PoolingSize", { Size2D(2, 2) })),
+                                                       framework::dataset::make("PadStride", { PadStrideInfo(2, 2, 0, 0), PadStrideInfo(2, 1, 0, 0) }));
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(MaxUnpooling, NEMaxUnpoolingLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerIndicesDatasetFPSmall,
+                                                                                                                   framework::dataset::make("DataType", DataType::F32))),
+                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })
+
+                                                                                                                  ))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // FP32
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(MaxUnpooling, NEMaxUnpoolingLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(datasets::SmallShapes(), combine(PoolingLayerIndicesDatasetFPSmall,
+                                                                                                                  framework::dataset::make("DataType", DataType::F16))),
+                                                                                                                  framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })
+
+                                                                                                                 ))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // FP16
+#endif           /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+TEST_SUITE_END() // Float
+TEST_SUITE_END() // PoolingLayer
+TEST_SUITE_END() // NEON
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/NEON/MeanStdDev.cpp b/tests/validation/NEON/MeanStdDev.cpp
index d8b3fd8..a109396 100644
--- a/tests/validation/NEON/MeanStdDev.cpp
+++ b/tests/validation/NEON/MeanStdDev.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/MeanStdDevNormalizationLayer.cpp b/tests/validation/NEON/MeanStdDevNormalizationLayer.cpp
index 1327009..11f97a5 100644
--- a/tests/validation/NEON/MeanStdDevNormalizationLayer.cpp
+++ b/tests/validation/NEON/MeanStdDevNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Median3x3.cpp b/tests/validation/NEON/Median3x3.cpp
index 4b0f639..1924a44 100644
--- a/tests/validation/NEON/Median3x3.cpp
+++ b/tests/validation/NEON/Median3x3.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/MinMaxLocation.cpp b/tests/validation/NEON/MinMaxLocation.cpp
index bc08666..973ea93 100644
--- a/tests/validation/NEON/MinMaxLocation.cpp
+++ b/tests/validation/NEON/MinMaxLocation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/NonLinearFilter.cpp b/tests/validation/NEON/NonLinearFilter.cpp
index 08ad1e2..c54394d 100644
--- a/tests/validation/NEON/NonLinearFilter.cpp
+++ b/tests/validation/NEON/NonLinearFilter.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/NormalizationLayer.cpp b/tests/validation/NEON/NormalizationLayer.cpp
index 20dcafb..255a68d 100644
--- a/tests/validation/NEON/NormalizationLayer.cpp
+++ b/tests/validation/NEON/NormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -61,8 +61,6 @@
 TEST_SUITE(NEON)
 TEST_SUITE(NormalizationLayer)
 
-//TODO(COMPMID-415): Missing configuration?
-
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
@@ -70,24 +68,21 @@
                                             TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Mismatching shapes
                                             TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Even normalization
                                             TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Non implemented IN_MAP_2D
-                                            TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Window shrink
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                           }),
     framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F16),
                                             TensorInfo(TensorShape(27U, 11U, 2U), 1, DataType::F32),
                                             TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
                                             TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
-                                            TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
                                             TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
                                           })),
     framework::dataset::make("NormInfo",  { NormalizationLayerInfo(NormType::IN_MAP_1D, 5),
                                             NormalizationLayerInfo(NormType::IN_MAP_1D, 5),
                                             NormalizationLayerInfo(NormType::IN_MAP_1D, 4),
                                             NormalizationLayerInfo(NormType::IN_MAP_2D, 5),
-                                            NormalizationLayerInfo(NormType::IN_MAP_1D, 5),
                                             NormalizationLayerInfo(NormType::CROSS_MAP, 1),
                                            })),
-    framework::dataset::make("Expected", { false, false, false, false, false, true })),
+    framework::dataset::make("Expected", { false, false, false, true, true })),
     input_info, output_info, norm_info, expected)
 {
     bool is_valid = bool(NENormalizationLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), norm_info));
@@ -96,6 +91,25 @@
 // clang-format on
 // *INDENT-ON*
 
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)),
+               shape, data_type)
+{
+    NormalizationLayerInfo info(NormType::IN_MAP_1D, 3U, 5.0f, 2.0f, 1.f, false);
+
+    // Create tensors
+    Tensor src = create_tensor<Tensor>(shape, data_type);
+    Tensor dst = create_tensor<Tensor>(shape, data_type);
+
+    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Create and configure function
+    NENormalizationLayer norm;
+    norm.configure(&src, &dst, info);
+
+    validate(src.info()->padding(), PaddingSize(0, 0, 0, 0));
+}
+
 template <typename T>
 using NENormalizationLayerFixture = NormalizationValidationFixture<Tensor, Accessor, NENormalizationLayer, T>;
 
diff --git a/tests/validation/NEON/OpticalFlow.cpp b/tests/validation/NEON/OpticalFlow.cpp
index 1f4bf5f..ba40d5f 100644
--- a/tests/validation/NEON/OpticalFlow.cpp
+++ b/tests/validation/NEON/OpticalFlow.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/PReluLayer.cpp b/tests/validation/NEON/PReluLayer.cpp
index bc93c65..c4b3c88 100644
--- a/tests/validation/NEON/PReluLayer.cpp
+++ b/tests/validation/NEON/PReluLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/PadLayer.cpp b/tests/validation/NEON/PadLayer.cpp
index 5049347..ea9ef72 100644
--- a/tests/validation/NEON/PadLayer.cpp
+++ b/tests/validation/NEON/PadLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Permute.cpp b/tests/validation/NEON/Permute.cpp
index 07578d3..d405582 100644
--- a/tests/validation/NEON/Permute.cpp
+++ b/tests/validation/NEON/Permute.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Phase.cpp b/tests/validation/NEON/Phase.cpp
index 92ed9f7..37b04f4 100644
--- a/tests/validation/NEON/Phase.cpp
+++ b/tests/validation/NEON/Phase.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/PixelWiseMultiplication.cpp b/tests/validation/NEON/PixelWiseMultiplication.cpp
index 6a75b00..0b88628 100644
--- a/tests/validation/NEON/PixelWiseMultiplication.cpp
+++ b/tests/validation/NEON/PixelWiseMultiplication.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -64,21 +64,36 @@
                                                          framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE }),
                                                          framework::dataset::make("RoundingPolicy", { RoundingPolicy::TO_ZERO }));
 
+/** Tests for in-place computation
+ * With current interface storing TensorInfo with quantization information
+ * in the kernel, it is difficult to have different tensor metadata
+ * (e.g., quantization information, data type, different shape for broadcasting)
+ * when an input is used as the output of the computation.
+ * So, the following dataset for in-place computation is used only when
+ * the exact same input and output Tensor object makes sense
+ * (i.e., all the tensor metadata is the same) whereas if output is
+ * expected to have either different quantization information, data type
+ * or different shape we are not testing in-place computation.
+ */
+const auto InPlaceDataSet = framework::dataset::make("InPlace", { false, true });
+
 #define DEFAULT_VALIDATE validate(Accessor(_target), _reference);
 #define VALIDATE(TYPE, TOLERANCE) validate(Accessor(_target), _reference, AbsoluteTolerance<TYPE>(TOLERANCE), 0.f);
 #define WRAP_VALIDATE(TYPE, TOLERANCE) validate_wrap(Accessor(_target), _reference, AbsoluteTolerance<TYPE>(TOLERANCE), 0.f);
 
 // *INDENT-OFF*
 // clang-format off
-#define PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(TEST_NAME, FIXTURE, MODE, SHAPES, DT1, DT2, SCALE, RP, VALIDATE) \
-    FIXTURE_DATA_TEST_CASE(TEST_NAME, NEPixelWiseMultiplication##FIXTURE, framework::DatasetMode::MODE,                   \
-                           combine(combine(combine(combine(combine(                                                       \
+#define PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(TEST_NAME, FIXTURE, MODE, SHAPES, DT1, DT2, DT3, SCALE, RP, INPLACE_DATASET, VALIDATE) \
+    FIXTURE_DATA_TEST_CASE(TEST_NAME, NEPixelWiseMultiplication##FIXTURE, framework::DatasetMode::MODE,                        \
+                           combine(combine(combine(combine(combine(combine(combine(                                            \
                            datasets::SHAPES,                                                                              \
                            framework::dataset::make("DataType1", DataType::DT1)),                                         \
                            framework::dataset::make("DataType2", DataType::DT2)),                                         \
+                           framework::dataset::make("DataType3", DataType::DT3)),                                         \
                            framework::dataset::make("Scale", std::move(SCALE))),                                          \
                            datasets::ConvertPolicies()),                                                                  \
-                           framework::dataset::make("RoundingPolicy", RoundingPolicy::RP)))                               \
+                           framework::dataset::make("RoundingPolicy", RoundingPolicy::RP)),                               \
+                           (INPLACE_DATASET)))                                                                            \
     {                                                                                                                     \
         VALIDATE                                                                                                          \
     }
@@ -98,9 +113,12 @@
 template <typename T>
 using NEPixelWiseMultiplicationToF16Fixture = PixelWiseMultiplicationValidationFixture<Tensor, Accessor, NEPixelWiseMultiplication, T, half_float::half>;
 template <typename T>
-using NEPixelWiseMultiplicationToF32Fixture = PixelWiseMultiplicationValidationFixture<Tensor, Accessor, NEPixelWiseMultiplication, T, float>;
+using NEPixelWiseMultiplicationToF32Fixture     = PixelWiseMultiplicationValidationFixture<Tensor, Accessor, NEPixelWiseMultiplication, T, float>;
+using NEPixelWiseMultiplicationU8U8ToS16Fixture = PixelWiseMultiplicationValidationFixture<Tensor, Accessor, NEPixelWiseMultiplication, uint8_t, uint8_t, int16_t>;
 template <typename T>
-using NEPixelWiseMultiplicationBroadcastFixture = PixelWiseMultiplicationBroadcastValidationFixture<Tensor, Accessor, NEPixelWiseMultiplication, T, float>;
+using NEPixelWiseMultiplicationBroadcastFixture              = PixelWiseMultiplicationBroadcastValidationFixture<Tensor, Accessor, NEPixelWiseMultiplication, T, float>;
+using NEPixelWiseMultiplicationBroadcastQASYMM8Fixture       = PixelWiseMultiplicationBroadcastValidationQuantizedFixture<Tensor, Accessor, NEPixelWiseMultiplication, uint8_t, uint8_t>;
+using NEPixelWiseMultiplicationBroadcastQASYMM8SignedFixture = PixelWiseMultiplicationBroadcastValidationQuantizedFixture<Tensor, Accessor, NEPixelWiseMultiplication, int8_t, int8_t>;
 
 TEST_SUITE(NEON)
 TEST_SUITE(PixelWiseMultiplication)
@@ -179,7 +197,7 @@
                                                    ConvertPolicy::WRAP,
                                         })),
 
-               framework::dataset::make("Expected", { true, true, false, false, false, false, false, false, true , false, false, true, false })),
+               framework::dataset::make("Expected", { true, true, true, false, false, false, false, false, true , false, false, true, false })),
                input1_info, input2_info, output_info, scale, policy, expected)
 {
     bool has_error = bool(NEPixelWiseMultiplication::validate(&input1_info.clone()->set_is_resizable(false), &input2_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), scale, policy, RoundingPolicy::TO_ZERO));
@@ -188,16 +206,69 @@
 // clang-format on
 // *INDENT-ON*
 
+TEST_SUITE(InPlaceValidate)
+TEST_CASE(SingleTensor, framework::DatasetMode::ALL)
+{
+    const auto random_shape       = TensorShape{ 9, 9 };
+    const auto single_tensor_info = TensorInfo{ random_shape, 1, DataType::F32 };
+
+    Status result = NEPixelWiseMultiplication::validate(&single_tensor_info, &single_tensor_info, &single_tensor_info, scale_unity, ConvertPolicy::WRAP, RoundingPolicy::TO_ZERO);
+    ARM_COMPUTE_EXPECT(bool(result) == true, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(ValidBroadCast, framework::DatasetMode::ALL)
+{
+    const auto larger_shape  = TensorShape{ 27U, 13U, 2U };
+    const auto smaller_shape = TensorShape{ 1U, 13U, 2U };
+
+    const auto larger_tensor_info  = TensorInfo{ larger_shape, 1, DataType::F32 };
+    const auto smaller_tensor_info = TensorInfo{ smaller_shape, 1, DataType::F32 };
+
+    Status result = NEPixelWiseMultiplication::validate(&larger_tensor_info, &smaller_tensor_info, &larger_tensor_info, scale_unity, ConvertPolicy::WRAP, RoundingPolicy::TO_ZERO);
+    ARM_COMPUTE_EXPECT(bool(result) == true, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(InvalidBroadcastOutput, framework::DatasetMode::ALL)
+{
+    const auto larger_shape  = TensorShape{ 27U, 13U, 2U };
+    const auto smaller_shape = TensorShape{ 1U, 13U, 2U };
+
+    const auto larger_tensor_info  = TensorInfo{ larger_shape, 1, DataType::F32 };
+    const auto smaller_tensor_info = TensorInfo{ smaller_shape, 1, DataType::F32 };
+
+    Status result = NEPixelWiseMultiplication::validate(&larger_tensor_info, &smaller_tensor_info, &smaller_tensor_info, scale_unity, ConvertPolicy::WRAP, RoundingPolicy::TO_ZERO);
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(InvalidBroadcastBoth, framework::DatasetMode::ALL)
+{
+    const auto shape0 = TensorShape{ 9U, 9U };
+    const auto shape1 = TensorShape{ 9U, 1U, 2U };
+
+    const auto info0 = TensorInfo{ shape0, 1, DataType::F32 };
+    const auto info1 = TensorInfo{ shape1, 1, DataType::F32 };
+
+    Status result{};
+
+    result = NEPixelWiseMultiplication::validate(&info0, &info1, &info0, scale_unity, ConvertPolicy::WRAP, RoundingPolicy::TO_ZERO);
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
+
+    result = NEPixelWiseMultiplication::validate(&info0, &info1, &info1, scale_unity, ConvertPolicy::WRAP, RoundingPolicy::TO_ZERO);
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
+}
+TEST_SUITE_END() // InPlaceValidate
+
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8_SIGNED)
 TEST_SUITE(Scale255)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8SignedFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                       framework::dataset::make("DataTypeIn1", DataType::QASYMM8_SIGNED)),
-                       framework::dataset::make("DataTypeIn2", DataType::QASYMM8_SIGNED)),
-                       framework::dataset::make("DataTypeOut", DataType::QASYMM8_SIGNED)),
-                       framework::dataset::make("Scale", { scale_unity })),
-                       PixelWiseMultiplicationPolicySTZDataset),
-                       PixelWiseMultiplicationQASYMM8QuantDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8SignedFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                                     framework::dataset::make("DataTypeIn1", DataType::QASYMM8_SIGNED)),
+                                                                                                                     framework::dataset::make("DataTypeIn2", DataType::QASYMM8_SIGNED)),
+                                                                                                                     framework::dataset::make("DataTypeOut", DataType::QASYMM8_SIGNED)),
+                                                                                                                     framework::dataset::make("Scale", { scale_unity })),
+                                                                                                                     PixelWiseMultiplicationPolicySTZDataset),
+                                                                                                                     PixelWiseMultiplicationQASYMM8QuantDataset),
+                                                                                                                     InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -207,146 +278,101 @@
 
 TEST_SUITE(QASYMM8)
 TEST_SUITE(Scale255)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                                                     framework::dataset::make("DataTypeIn1", DataType::QASYMM8)),
-                                                                                                                     framework::dataset::make("DataTypeIn2", DataType::QASYMM8)),
-                                                                                                                     framework::dataset::make("DataTypeOut", DataType::QASYMM8)),
-                                                                                                                     framework::dataset::make("Scale", { scale_255 })),
-                                                                                                                     PixelWiseMultiplicationPolicySTNUDataset),
-                                                                                                                     PixelWiseMultiplicationQASYMM8QuantDataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEPixelWiseMultiplicationQASYMM8Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeShapes(),
-                                                                                                                   framework::dataset::make("DataTypeIn1", DataType::QASYMM8)),
-                                                                                                                   framework::dataset::make("DataTypeIn2", DataType::QASYMM8)),
-                                                                                                                   framework::dataset::make("DataTypeOut", DataType::QASYMM8)),
-                                                                                                                   framework::dataset::make("Scale", { scale_255 })),
-                                                                                                                   PixelWiseMultiplicationPolicySTNUDataset),
-                                                                                                                   PixelWiseMultiplicationQASYMM8QuantDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                                       framework::dataset::make("DataTypeIn1", DataType::QASYMM8)),
+                                                                                                                       framework::dataset::make("DataTypeIn2", DataType::QASYMM8)),
+                                                                                                                       framework::dataset::make("DataTypeOut", DataType::QASYMM8)),
+                                                                                                                       framework::dataset::make("Scale", { scale_255 })),
+                                                                                                                       PixelWiseMultiplicationPolicySTNUDataset),
+                                                                                                                       PixelWiseMultiplicationQASYMM8QuantDataset),
+                                                                                                               InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
 TEST_SUITE_END() // Scale255
 TEST_SUITE(ScaleUnity)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                                                     framework::dataset::make("DataTypeIn1", DataType::QASYMM8)),
-                                                                                                                     framework::dataset::make("DataTypeIn2", DataType::QASYMM8)),
-                                                                                                                     framework::dataset::make("DataTypeOut", DataType::QASYMM8)),
-                                                                                                                     framework::dataset::make("Scale", { scale_unity })),
-                                                                                                                     PixelWiseMultiplicationPolicySTZDataset),
-                                                                                                                     PixelWiseMultiplicationQASYMM8QuantDataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEPixelWiseMultiplicationQASYMM8Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeShapes(),
-                                                                                                                   framework::dataset::make("DataTypeIn1", DataType::QASYMM8)),
-                                                                                                                   framework::dataset::make("DataTypeIn2", DataType::QASYMM8)),
-                                                                                                                   framework::dataset::make("DataTypeOut", DataType::QASYMM8)),
-                                                                                                                   framework::dataset::make("Scale", { scale_unity })),
-                                                                                                                   PixelWiseMultiplicationPolicySTZDataset),
-                                                                                                                   PixelWiseMultiplicationQASYMM8QuantDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                                       framework::dataset::make("DataTypeIn1", DataType::QASYMM8)),
+                                                                                                                       framework::dataset::make("DataTypeIn2", DataType::QASYMM8)),
+                                                                                                                       framework::dataset::make("DataTypeOut", DataType::QASYMM8)),
+                                                                                                                       framework::dataset::make("Scale", { scale_unity })),
+                                                                                                                       PixelWiseMultiplicationPolicySTZDataset),
+                                                                                                                       PixelWiseMultiplicationQASYMM8QuantDataset),
+                                                                                                               InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
 TEST_SUITE_END() // ScaleUnity
 TEST_SUITE(ScaleOther)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                                                     framework::dataset::make("DataTypeIn1", DataType::QASYMM8)),
-                                                                                                                     framework::dataset::make("DataTypeIn2", DataType::QASYMM8)),
-                                                                                                                     framework::dataset::make("DataTypeOut", DataType::QASYMM8)),
-                                                                                                                     framework::dataset::make("Scale", { scale_other })),
-                                                                                                                     PixelWiseMultiplicationPolicySTZDataset),
-                                                                                                                     PixelWiseMultiplicationQASYMM8QuantDataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance_qasymm8);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEPixelWiseMultiplicationQASYMM8Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeShapes(),
-                                                                                                                   framework::dataset::make("DataTypeIn1", DataType::QASYMM8)),
-                                                                                                                   framework::dataset::make("DataTypeIn2", DataType::QASYMM8)),
-                                                                                                                   framework::dataset::make("DataTypeOut", DataType::QASYMM8)),
-                                                                                                                   framework::dataset::make("Scale", { scale_other })),
-                                                                                                                   PixelWiseMultiplicationPolicySTZDataset),
-                                                                                                                   PixelWiseMultiplicationQASYMM8QuantDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQASYMM8Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                                       framework::dataset::make("DataTypeIn1", DataType::QASYMM8)),
+                                                                                                                       framework::dataset::make("DataTypeIn2", DataType::QASYMM8)),
+                                                                                                                       framework::dataset::make("DataTypeOut", DataType::QASYMM8)),
+                                                                                                                       framework::dataset::make("Scale", { scale_other })),
+                                                                                                                       PixelWiseMultiplicationPolicySTZDataset),
+                                                                                                                       PixelWiseMultiplicationQASYMM8QuantDataset),
+                                                                                                               InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
 }
 TEST_SUITE_END() // ScaleOther
+TEST_SUITE(Broadcast)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationBroadcastQASYMM8Fixture, framework::DatasetMode::ALL,
+                       combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapesBroadcast(),
+                                                                               framework::dataset::make("DataTypeIn1", DataType::QASYMM8)),
+                                                                       framework::dataset::make("DataTypeIn2", DataType::QASYMM8)),
+                                                               framework::dataset::make("DataTypeOut", DataType::QASYMM8)),
+                                                       framework::dataset::make("Scale", { scale_other })),
+                                               PixelWiseMultiplicationPolicySTZDataset),
+                                       PixelWiseMultiplicationQASYMM8QuantDataset),
+                               framework::dataset::make("InPlace", { false })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // Broadcast
 TEST_SUITE_END() // QASYMM8
 TEST_SUITE(QSYMM16)
 TEST_SUITE(Scale255)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                                                     framework::dataset::make("DataTypeIn1", DataType::QSYMM16)),
-                                                                                                                     framework::dataset::make("DataTypeIn2", DataType::QSYMM16)),
-                                                                                                                     framework::dataset::make("DataTypeOut", DataType::QSYMM16)),
-                                                                                                                     framework::dataset::make("Scale", { scale_255 })),
-                                                                                                                     PixelWiseMultiplicationPolicySTNUDataset),
-                                                                                                                     PixelWiseMultiplicationQSYMM16QuantDataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance_qsymm16);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEPixelWiseMultiplicationQSYMM16Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeShapes(),
-                                                                                                                   framework::dataset::make("DataTypeIn1", DataType::QSYMM16)),
-                                                                                                                   framework::dataset::make("DataTypeIn2", DataType::QSYMM16)),
-                                                                                                                   framework::dataset::make("DataTypeOut", DataType::QSYMM16)),
-                                                                                                                   framework::dataset::make("Scale", { scale_255 })),
-                                                                                                                   PixelWiseMultiplicationPolicySTNUDataset),
-                                                                                                                   PixelWiseMultiplicationQSYMM16QuantDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                                       framework::dataset::make("DataTypeIn1", DataType::QSYMM16)),
+                                                                                                                       framework::dataset::make("DataTypeIn2", DataType::QSYMM16)),
+                                                                                                                       framework::dataset::make("DataTypeOut", DataType::QSYMM16)),
+                                                                                                                       framework::dataset::make("Scale", { scale_255 })),
+                                                                                                                       PixelWiseMultiplicationPolicySTNUDataset),
+                                                                                                                       PixelWiseMultiplicationQSYMM16QuantDataset),
+                                                                                                               InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qsymm16);
 }
 TEST_SUITE_END() // Scale255
 TEST_SUITE(ScaleUnity)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                                                     framework::dataset::make("DataTypeIn1", DataType::QSYMM16)),
-                                                                                                                     framework::dataset::make("DataTypeIn2", DataType::QSYMM16)),
-                                                                                                                     framework::dataset::make("DataTypeOut", DataType::QSYMM16)),
-                                                                                                                     framework::dataset::make("Scale", { scale_unity })),
-                                                                                                                     PixelWiseMultiplicationPolicySTZDataset),
-                                                                                                                     PixelWiseMultiplicationQSYMM16QuantDataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance_qsymm16);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEPixelWiseMultiplicationQSYMM16Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeShapes(),
-                                                                                                                   framework::dataset::make("DataTypeIn1", DataType::QSYMM16)),
-                                                                                                                   framework::dataset::make("DataTypeIn2", DataType::QSYMM16)),
-                                                                                                                   framework::dataset::make("DataTypeOut", DataType::QSYMM16)),
-                                                                                                                   framework::dataset::make("Scale", { scale_unity })),
-                                                                                                                   PixelWiseMultiplicationPolicySTZDataset),
-                                                                                                                   PixelWiseMultiplicationQSYMM16QuantDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                                       framework::dataset::make("DataTypeIn1", DataType::QSYMM16)),
+                                                                                                                       framework::dataset::make("DataTypeIn2", DataType::QSYMM16)),
+                                                                                                                       framework::dataset::make("DataTypeOut", DataType::QSYMM16)),
+                                                                                                                       framework::dataset::make("Scale", { scale_unity })),
+                                                                                                                       PixelWiseMultiplicationPolicySTZDataset),
+                                                                                                                       PixelWiseMultiplicationQSYMM16QuantDataset),
+                                                                                                               InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qsymm16);
 }
 TEST_SUITE_END() // ScaleUnity
 TEST_SUITE(ScaleOther)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                                                     framework::dataset::make("DataTypeIn1", DataType::QSYMM16)),
-                                                                                                                     framework::dataset::make("DataTypeIn2", DataType::QSYMM16)),
-                                                                                                                     framework::dataset::make("DataTypeOut", DataType::QSYMM16)),
-                                                                                                                     framework::dataset::make("Scale", { scale_other })),
-                                                                                                                     PixelWiseMultiplicationPolicySTZDataset),
-                                                                                                                     PixelWiseMultiplicationQSYMM16QuantDataset))
-{
-    // Validate output
-    validate(Accessor(_target), _reference, tolerance_qsymm16);
-}
-FIXTURE_DATA_TEST_CASE(RunLarge, NEPixelWiseMultiplicationQSYMM16Fixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeShapes(),
-                                                                                                                   framework::dataset::make("DataTypeIn1", DataType::QSYMM16)),
-                                                                                                                   framework::dataset::make("DataTypeIn2", DataType::QSYMM16)),
-                                                                                                                   framework::dataset::make("DataTypeOut", DataType::QSYMM16)),
-                                                                                                                   framework::dataset::make("Scale", { scale_other })),
-                                                                                                                   PixelWiseMultiplicationPolicySTZDataset),
-                                                                                                                   PixelWiseMultiplicationQSYMM16QuantDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                                       framework::dataset::make("DataTypeIn1", DataType::QSYMM16)),
+                                                                                                                       framework::dataset::make("DataTypeIn2", DataType::QSYMM16)),
+                                                                                                                       framework::dataset::make("DataTypeOut", DataType::QSYMM16)),
+                                                                                                                       framework::dataset::make("Scale", { scale_other })),
+                                                                                                                       PixelWiseMultiplicationPolicySTZDataset),
+                                                                                                                       PixelWiseMultiplicationQSYMM16QuantDataset),
+                                                                                                               InPlaceDataSet))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qsymm16);
@@ -354,13 +380,14 @@
 TEST_SUITE_END() // ScaleOther
 TEST_SUITE_END() // QSYMM16
 TEST_SUITE(QSYMM16toS32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16ToS32Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                       framework::dataset::make("DataTypeIn1", DataType::QSYMM16)),
-                       framework::dataset::make("DataTypeIn2", DataType::QSYMM16)),
-                       framework::dataset::make("DataTypeOut", DataType::S32)),
-                       framework::dataset::make("Scale", { scale_unity })),
-                       PixelWiseMultiplicationPolicySTZDataset),
-                       PixelWiseMultiplicationQSYMM16QuantDataset))
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16ToS32Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                                    framework::dataset::make("DataTypeIn1", DataType::QSYMM16)),
+                                                                                                                    framework::dataset::make("DataTypeIn2", DataType::QSYMM16)),
+                                                                                                                    framework::dataset::make("DataTypeOut", DataType::S32)),
+                                                                                                                    framework::dataset::make("Scale", { scale_unity })),
+                                                                                                                    PixelWiseMultiplicationPolicySTZDataset),
+                                                                                                                    PixelWiseMultiplicationQSYMM16QuantDataset),
+                                                                                                                    framework::dataset::make("InPlace", { false })))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -368,21 +395,48 @@
 TEST_SUITE_END() // QSYMM16toS32
 TEST_SUITE_END() // Quantized
 
+TEST_SUITE(U8U8toS16)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationU8U8ToS16Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                                       framework::dataset::make("DataTypeIn1", DataType::U8)),
+                                                                                                                       framework::dataset::make("DataTypeIn2", DataType::U8)),
+                                                                                                                       framework::dataset::make("DataTypeOut", DataType::S16)),
+                                                                                                                       framework::dataset::make("Scale", { scale_255 })),
+                                                                                                                       datasets::ConvertPolicies()),
+                                                                                                                       framework::dataset::make("RoundingPolicy", RoundingPolicy::TO_NEAREST_UP)),
+                                                                                                                       framework::dataset::make("InPlace", { false })))
+{
+    // Validate output
+    validate_wrap(Accessor(_target), _reference, AbsoluteTolerance<int16_t>(1), 0.f);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmall1, NEPixelWiseMultiplicationU8U8ToS16Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                                        framework::dataset::make("DataTypeIn1", DataType::U8)),
+                                                                                                                        framework::dataset::make("DataTypeIn2", DataType::U8)),
+                                                                                                                        framework::dataset::make("DataTypeOut", DataType::S16)),
+                                                                                                                        framework::dataset::make("Scale", { scale_other })),
+                                                                                                                        datasets::ConvertPolicies()),
+                                                                                                                        framework::dataset::make("RoundingPolicy", RoundingPolicy::TO_ZERO)),
+                                                                                                                        framework::dataset::make("InPlace", { false })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+
+TEST_SUITE_END() // U8U8toS16
+
 TEST_SUITE(U8toU8)
 
 TEST_SUITE(Scale255)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToU8Fixture<uint8_t>, PRECOMMIT, SmallShapes(), U8, U8, scale_255, TO_NEAREST_UP, WRAP_VALIDATE(uint8_t, 1))
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunLarge, ToU8Fixture<uint8_t>, NIGHTLY, LargeShapes(), U8, U8, scale_255, TO_NEAREST_UP, WRAP_VALIDATE(uint8_t, 1))
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToU8Fixture<uint8_t>, ALL, SmallShapes(), U8, U8, U8, scale_255, TO_NEAREST_UP, InPlaceDataSet, WRAP_VALIDATE(uint8_t, 1))
 TEST_SUITE_END() // Scale255
 
 TEST_SUITE(ScaleUnity)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToU8Fixture<uint8_t>, PRECOMMIT, SmallShapes(), U8, U8, scale_unity, TO_ZERO, DEFAULT_VALIDATE)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunLarge, ToU8Fixture<uint8_t>, NIGHTLY, LargeShapes(), U8, U8, scale_unity, TO_ZERO, DEFAULT_VALIDATE)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToU8Fixture<uint8_t>, ALL, SmallShapes(), U8, U8, U8, scale_unity, TO_ZERO, InPlaceDataSet, DEFAULT_VALIDATE)
 TEST_SUITE_END() // ScaleUnity
 
 TEST_SUITE(ScaleOther)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToU8Fixture<uint8_t>, PRECOMMIT, SmallShapes(), U8, U8, scale_other, TO_ZERO, DEFAULT_VALIDATE)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunLarge, ToU8Fixture<uint8_t>, NIGHTLY, LargeShapes(), U8, U8, scale_other, TO_ZERO, DEFAULT_VALIDATE)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToU8Fixture<uint8_t>, ALL, SmallShapes(), U8, U8, U8, scale_other, TO_ZERO, InPlaceDataSet, DEFAULT_VALIDATE)
 TEST_SUITE_END() // ScaleOther
 
 TEST_SUITE_END() // U8toU8
@@ -390,18 +444,18 @@
 TEST_SUITE(U8toS16)
 
 TEST_SUITE(Scale255)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, PRECOMMIT, SmallShapes(), U8, S16, scale_255, TO_NEAREST_UP, WRAP_VALIDATE(int16_t, 2))
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunLarge, ToS16Fixture<uint8_t>, NIGHTLY, LargeShapes(), U8, S16, scale_255, TO_NEAREST_UP, WRAP_VALIDATE(int16_t, 2))
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, ALL, SmallShapes(), U8, S16, S16, scale_255, TO_NEAREST_UP, framework::dataset::make("InPlace", { false }),
+                                                 WRAP_VALIDATE(int16_t, 2))
 TEST_SUITE_END() // Scale255
 
 TEST_SUITE(ScaleUnity)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, PRECOMMIT, SmallShapes(), U8, S16, scale_unity, TO_ZERO, DEFAULT_VALIDATE)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunLarge, ToS16Fixture<uint8_t>, NIGHTLY, LargeShapes(), U8, S16, scale_unity, TO_ZERO, DEFAULT_VALIDATE)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, ALL, SmallShapes(), U8, S16, S16, scale_unity, TO_ZERO, framework::dataset::make("InPlace", { false }),
+                                                 DEFAULT_VALIDATE)
 TEST_SUITE_END() // ScaleUnity
 
 TEST_SUITE(ScaleOther)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, PRECOMMIT, SmallShapes(), U8, S16, scale_other, TO_ZERO, DEFAULT_VALIDATE)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunLarge, ToS16Fixture<uint8_t>, NIGHTLY, LargeShapes(), U8, S16, scale_other, TO_ZERO, DEFAULT_VALIDATE)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<uint8_t>, ALL, SmallShapes(), U8, S16, S16, scale_other, TO_ZERO, framework::dataset::make("InPlace", { false }),
+                                                 DEFAULT_VALIDATE)
 TEST_SUITE_END() // ScaleOther
 
 TEST_SUITE_END() // U8toS16
@@ -409,18 +463,15 @@
 TEST_SUITE(S16toS16)
 
 TEST_SUITE(Scale255)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<int16_t>, PRECOMMIT, SmallShapes(), S16, S16, scale_255, TO_NEAREST_UP, WRAP_VALIDATE(int16_t, 2))
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunLarge, ToS16Fixture<int16_t>, NIGHTLY, LargeShapes(), S16, S16, scale_255, TO_NEAREST_UP, WRAP_VALIDATE(int16_t, 2))
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<int16_t>, ALL, SmallShapes(), S16, S16, S16, scale_255, TO_NEAREST_UP, InPlaceDataSet, WRAP_VALIDATE(int16_t, 2))
 TEST_SUITE_END() // Scale255
 
 TEST_SUITE(ScaleUnity)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<int16_t>, PRECOMMIT, SmallShapes(), S16, S16, scale_unity, TO_ZERO, DEFAULT_VALIDATE)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunLarge, ToS16Fixture<int16_t>, NIGHTLY, LargeShapes(), S16, S16, scale_unity, TO_ZERO, DEFAULT_VALIDATE)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<int16_t>, ALL, SmallShapes(), S16, S16, S16, scale_unity, TO_ZERO, InPlaceDataSet, DEFAULT_VALIDATE)
 TEST_SUITE_END() // ScaleUnity
 
 TEST_SUITE(ScaleOther)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<int16_t>, PRECOMMIT, SmallShapes(), S16, S16, scale_other, TO_ZERO, DEFAULT_VALIDATE)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunLarge, ToS16Fixture<int16_t>, NIGHTLY, LargeShapes(), S16, S16, scale_other, TO_ZERO, DEFAULT_VALIDATE)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture<int16_t>, ALL, SmallShapes(), S16, S16, S16, scale_other, TO_ZERO, InPlaceDataSet, DEFAULT_VALIDATE)
 TEST_SUITE_END() // ScaleOther
 
 TEST_SUITE_END() // S16toS16
@@ -429,7 +480,7 @@
 TEST_SUITE(F16toF16)
 
 TEST_SUITE(Scale255)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToF16Fixture<half_float::half>, PRECOMMIT, SmallShapes(), F16, F16, scale_255, TO_NEAREST_UP, VALIDATE(float, 1.f))
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToF16Fixture<half_float::half>, ALL, SmallShapes(), F16, F16, F16, scale_255, TO_NEAREST_UP, InPlaceDataSet, VALIDATE(float, 1.f))
 TEST_SUITE_END() // Scale255
 
 TEST_SUITE_END() // F16toF16
@@ -438,24 +489,22 @@
 TEST_SUITE(F32toF32)
 
 TEST_SUITE(Scale255)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToF32Fixture<float>, PRECOMMIT, SmallShapes(), F32, F32, scale_255, TO_NEAREST_UP, VALIDATE(float, 1.f))
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunLarge, ToF32Fixture<float>, NIGHTLY, LargeShapes(), F32, F32, scale_255, TO_NEAREST_UP, VALIDATE(float, 1.f))
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToF32Fixture<float>, ALL, SmallShapes(), F32, F32, F32, scale_255, TO_NEAREST_UP, InPlaceDataSet, VALIDATE(float, 1.f))
 TEST_SUITE_END() // Scale255
 
 TEST_SUITE(ScaleUnity)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToF32Fixture<float>, PRECOMMIT, SmallShapes(), F32, F32, scale_unity, TO_ZERO, DEFAULT_VALIDATE)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunLarge, ToF32Fixture<float>, NIGHTLY, LargeShapes(), F32, F32, scale_unity, TO_ZERO, DEFAULT_VALIDATE)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToF32Fixture<float>, ALL, SmallShapes(), F32, F32, F32, scale_unity, TO_ZERO, InPlaceDataSet, DEFAULT_VALIDATE)
 TEST_SUITE_END() // ScaleUnity
 
 TEST_SUITE(ScaleOther)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToF32Fixture<float>, PRECOMMIT, SmallShapes(), F32, F32, scale_other, TO_ZERO, DEFAULT_VALIDATE)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunLarge, ToF32Fixture<float>, NIGHTLY, LargeShapes(), F32, F32, scale_other, TO_ZERO, DEFAULT_VALIDATE)
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToF32Fixture<float>, ALL, SmallShapes(), F32, F32, F32, scale_other, TO_ZERO, InPlaceDataSet, DEFAULT_VALIDATE)
 TEST_SUITE_END() // ScaleOther
 
 TEST_SUITE_END() // F32toF32
 
 TEST_SUITE(Broadcast)
-PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, BroadcastFixture<float>, PRECOMMIT, SmallShapesBroadcast(), F32, F32, scale_255, TO_NEAREST_UP, VALIDATE(float, 1.f))
+PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, BroadcastFixture<float>, ALL, SmallShapesBroadcast(), F32, F32, F32, scale_255, TO_NEAREST_UP, framework::dataset::make("InPlace", { false }),
+                                                 VALIDATE(float, 1.f))
 TEST_SUITE_END() // Broadcast
 
 TEST_SUITE_END()
diff --git a/tests/validation/NEON/PoolingLayer.cpp b/tests/validation/NEON/PoolingLayer.cpp
index 4b073d5..0a6a5a1 100644
--- a/tests/validation/NEON/PoolingLayer.cpp
+++ b/tests/validation/NEON/PoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/PriorBoxLayer.cpp b/tests/validation/NEON/PriorBoxLayer.cpp
index 5659b9f..cc63877 100644
--- a/tests/validation/NEON/PriorBoxLayer.cpp
+++ b/tests/validation/NEON/PriorBoxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/QLSTMLayerNormalization.cpp b/tests/validation/NEON/QLSTMLayerNormalization.cpp
index 3d71175..f3cd5fb 100644
--- a/tests/validation/NEON/QLSTMLayerNormalization.cpp
+++ b/tests/validation/NEON/QLSTMLayerNormalization.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/QuantizationLayer.cpp b/tests/validation/NEON/QuantizationLayer.cpp
index a5372b8..0156be2 100644
--- a/tests/validation/NEON/QuantizationLayer.cpp
+++ b/tests/validation/NEON/QuantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/RNNLayer.cpp b/tests/validation/NEON/RNNLayer.cpp
index a5f8499..7a3117a 100644
--- a/tests/validation/NEON/RNNLayer.cpp
+++ b/tests/validation/NEON/RNNLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/ROIAlignLayer.cpp b/tests/validation/NEON/ROIAlignLayer.cpp
index b5deb01..3f6c9d2 100644
--- a/tests/validation/NEON/ROIAlignLayer.cpp
+++ b/tests/validation/NEON/ROIAlignLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Range.cpp b/tests/validation/NEON/Range.cpp
index 06351c8..57e891d 100644
--- a/tests/validation/NEON/Range.cpp
+++ b/tests/validation/NEON/Range.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/ReduceMean.cpp b/tests/validation/NEON/ReduceMean.cpp
index 821171a..23229a0 100644
--- a/tests/validation/NEON/ReduceMean.cpp
+++ b/tests/validation/NEON/ReduceMean.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/ReductionOperation.cpp b/tests/validation/NEON/ReductionOperation.cpp
index f155e97..47b36c6 100644
--- a/tests/validation/NEON/ReductionOperation.cpp
+++ b/tests/validation/NEON/ReductionOperation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -46,7 +46,7 @@
 AbsoluteTolerance<float> tolerance_f32(0.0001f);
 RelativeTolerance<float> rel_tolerance_f32(0.0001f);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-AbsoluteTolerance<float> tolerance_f16(0.1f);
+AbsoluteTolerance<float> tolerance_f16(0.2f);
 RelativeTolerance<float> rel_tolerance_f16(0.1f);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 /** Tolerance for quantized operations */
diff --git a/tests/validation/NEON/Remap.cpp b/tests/validation/NEON/Remap.cpp
index 2e54b11..f8d7a25 100644
--- a/tests/validation/NEON/Remap.cpp
+++ b/tests/validation/NEON/Remap.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/ReorgLayer.cpp b/tests/validation/NEON/ReorgLayer.cpp
index 5a76315..e79a671 100644
--- a/tests/validation/NEON/ReorgLayer.cpp
+++ b/tests/validation/NEON/ReorgLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/ReshapeLayer.cpp b/tests/validation/NEON/ReshapeLayer.cpp
index 3d1e0e6..bf39c39 100644
--- a/tests/validation/NEON/ReshapeLayer.cpp
+++ b/tests/validation/NEON/ReshapeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Reverse.cpp b/tests/validation/NEON/Reverse.cpp
index 2f3f69a..4278f35 100644
--- a/tests/validation/NEON/Reverse.cpp
+++ b/tests/validation/NEON/Reverse.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Scale.cpp b/tests/validation/NEON/Scale.cpp
index b7d7b00..9d9a282 100644
--- a/tests/validation/NEON/Scale.cpp
+++ b/tests/validation/NEON/Scale.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,13 +28,9 @@
 #include "arm_compute/runtime/TensorAllocator.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/PaddingCalculator.h"
-#include "tests/datasets/BorderModeDataset.h"
-#include "tests/datasets/InterpolationPolicyDataset.h"
-#include "tests/datasets/SamplingPolicyDataset.h"
-#include "tests/datasets/ShapeDatasets.h"
+#include "tests/datasets/ScaleValidationDataset.h"
 #include "tests/framework/Asserts.h"
 #include "tests/framework/Macros.h"
-#include "tests/framework/datasets/Datasets.h"
 #include "tests/validation/Helpers.h"
 #include "tests/validation/Validation.h"
 #include "tests/validation/fixtures/ScaleFixture.h"
@@ -47,6 +43,25 @@
 {
 namespace
 {
+using datasets::ScaleShapesBaseDataSet;
+using datasets::ScaleInterpolationPolicySet;
+using datasets::ScaleDataLayouts;
+using datasets::ScaleSamplingPolicySet;
+using datasets::ScaleAlignCornersSamplingPolicySet;
+
+/** We consider vector size in byte 64 since the maximum size of
+ * a vector used by @ref NEScaleKernel is currently 64-byte (float32x4x4).
+ * There are possibility to reduce test time further by using
+ * smaller vector sizes for different data types where applicable.
+ */
+constexpr uint32_t vector_byte = 64;
+
+template <typename T>
+constexpr uint32_t num_elements_per_vector()
+{
+    return vector_byte / sizeof(T);
+}
+
 /** Scale data types */
 const auto ScaleDataTypes = framework::dataset::make("DataType",
 {
@@ -55,18 +70,10 @@
     DataType::F32,
 });
 
-/** Scale data types */
-const auto ScaleDataLayouts = framework::dataset::make("DataLayout",
+/** Quantization information data set */
+const auto QuantizationInfoSet = framework::dataset::make("QuantizationInfo",
 {
-    DataLayout::NCHW,
-    DataLayout::NHWC,
-});
-
-/** Align corners */
-const auto AlignCorners = framework::dataset::make("AlignCorners",
-{
-    false,
-    true,
+    QuantizationInfo(0.5f, -10),
 });
 
 /** Tolerance */
@@ -83,123 +90,162 @@
 
 TEST_SUITE(NEON)
 TEST_SUITE(Scale)
+TEST_SUITE(Validate)
 
-// *INDENT-OFF*
-// clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
-        framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8),  // Mismatching data type
-                                                TensorInfo(TensorShape(4U, 27U, 13U), 1, DataType::F32), // Invalid policy
-                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Insufficient padding
-                                                TensorInfo(TensorShape(4U, 27U, 13U), 1, DataType::F32),
-                                              }),
-        framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(132U, 25U, 2U), 1, DataType::F32),
-                                                TensorInfo(TensorShape(4U, 132U, 25U), 1, DataType::F32),
-                                                TensorInfo(TensorShape(132U, 25U, 2U), 1, DataType::F32),
-                                                TensorInfo(TensorShape(4U, 132U, 25U), 1, DataType::F32),
-                                              })),
-        framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR,
-                                                          InterpolationPolicy::AREA,
-                                                          InterpolationPolicy::AREA,
-                                                          InterpolationPolicy::NEAREST_NEIGHBOR,
-                                                        })),
-        framework::dataset::make("BorderMode",  { BorderMode::UNDEFINED,
-                                                  BorderMode::UNDEFINED,
-                                                  BorderMode::UNDEFINED,
-                                                  BorderMode::REPLICATE,
-                                                })),
-        framework::dataset::make("SamplingPolicy",  { SamplingPolicy::CENTER,
-                                                      SamplingPolicy::CENTER,
-                                                      SamplingPolicy::CENTER,
-                                                      SamplingPolicy::CENTER,
-                                                    })),
-        framework::dataset::make("DataLayout",  { DataLayout::NCHW,
-                                                  DataLayout::NHWC,
-                                                  DataLayout::NCHW,
-                                                  DataLayout::NHWC,
-                                                })),
-        framework::dataset::make("Expected", { false, false, false ,true })),
-        input_info, output_info, policy,border_mode, sampling_policy, data_layout, expected)
+/** Validate test suite is to test ARM_COMPUTE_RETURN_ON_* macros
+ * we use to check the validity of given arguments in @ref NEScale
+ * and subsequent call to @ref NEScaleKernel.
+ * Since this is using validate() of @ref NEScale, which pre-adjust
+ * arguments for @ref NEScaleKernel, the following conditions in
+ * the kernel are not currently tested.
+ * - The same input and output
+ * - Data type of offset, dx and dy
+ * This suite also tests two different validate() APIs - one is
+ * using @ref ScaleKernelInfo and the other one is more verbose
+ * one calls the other one - in the same test case. Even though
+ * there are possibility that it makes debugging for regression
+ * harder, belows are reasons of this test case implementation.
+ * - The more verbose one is just a wrapper function calls
+ *   the other one without any additional logic. So we are
+ *   safe to merge two tests into one.
+ * - A large amount of code duplication is test suite can be prevented.
+ */
+
+const auto input_shape  = TensorShape{ 2, 3, 3, 2 };
+const auto output_shape = TensorShape{ 4, 6, 3, 2 };
+
+constexpr auto default_data_type            = DataType::U8;
+constexpr auto default_data_layout          = DataLayout::NHWC;
+constexpr auto default_interpolation_policy = InterpolationPolicy::NEAREST_NEIGHBOR;
+constexpr auto default_border_mode          = BorderMode::UNDEFINED;
+constexpr auto default_sampling_policy      = SamplingPolicy::CENTER;
+constexpr bool default_use_padding          = false;
+
+TEST_CASE(NullPtr, framework::DatasetMode::ALL)
 {
-    const PixelValue constant_border(5);
-    Status status = NEScale::validate(&input_info.clone()->set_is_resizable(false).set_data_layout(data_layout),
-                                           &output_info.clone()->set_is_resizable(false).set_data_layout(data_layout),
-                                           policy, border_mode, constant_border, sampling_policy);
-    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
-}
-// clang-format on
-// *INDENT-ON*
+    const auto input  = TensorInfo{ input_shape, 1, default_data_type, default_data_layout };
+    const auto output = TensorInfo{ output_shape, 1, default_data_type, default_data_layout };
+    Status     result{};
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallShapes(), ScaleDataTypes), ScaleDataLayouts),
-                                                                                   framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                           datasets::BorderModes()),
-                                                                   framework::dataset::make("SamplingPolicy", { SamplingPolicy::CENTER })),
-               shape, data_type, data_layout, policy, border_mode, sampling_policy)
+    // nullptr is given as input
+    result = NEScale::validate(nullptr, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode });
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
+
+    // nullptr is given as output
+    result = NEScale::validate(&input, nullptr, ScaleKernelInfo{ default_interpolation_policy, default_border_mode });
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(SupportDataType, framework::DatasetMode::ALL)
 {
-    std::mt19937                          generator(library->seed());
-    std::uniform_real_distribution<float> distribution_float(0.25, 2);
-    const float                           scale_x               = distribution_float(generator);
-    const float                           scale_y               = distribution_float(generator);
-    uint8_t                               constant_border_value = 0;
-    TensorShape                           src_shape             = shape;
-    if(border_mode == BorderMode::CONSTANT)
+    const std::map<DataType, bool> supported_data_types =
     {
-        std::uniform_int_distribution<uint8_t> distribution_u8(0, 255);
-        constant_border_value = distribution_u8(generator);
-    }
-
-    // Get width/height indices depending on layout
-    const int idx_width  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
-    // Change shape in case of NHWC.
-    if(data_layout == DataLayout::NHWC)
+        { DataType::U8, true },
+        { DataType::S8, false },
+        { DataType::QSYMM8, false },
+        { DataType::QASYMM8, true },
+        { DataType::QASYMM8_SIGNED, true },
+        { DataType::QSYMM8_PER_CHANNEL, false },
+        { DataType::U16, false },
+        { DataType::S16, true },
+        { DataType::QSYMM16, false },
+        { DataType::QASYMM16, false },
+        { DataType::U32, false },
+        { DataType::S32, false },
+        { DataType::U64, false },
+        { DataType::S64, false },
+        { DataType::BFLOAT16, false },
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        { DataType::F16, true },
+#else  // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        { DataType::F16, false },
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        { DataType::F32, true },
+        { DataType::F64, false },
+        { DataType::SIZET, false },
+    };
+    Status result{};
+    for(auto &kv : supported_data_types)
     {
-        permute(src_shape, PermutationVector(2U, 0U, 1U));
+        const auto input  = TensorInfo{ input_shape, 1, kv.first, default_data_layout };
+        const auto output = TensorInfo{ output_shape, 1, kv.first, default_data_layout };
+
+        result = NEScale::validate(&input, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode });
+        ARM_COMPUTE_EXPECT(bool(result) == kv.second, framework::LogLevel::ERRORS);
     }
-
-    // Calculate scaled shape
-    TensorShape shape_scaled(src_shape);
-    shape_scaled.set(idx_width, src_shape[idx_width] * scale_x);
-    shape_scaled.set(idx_height, src_shape[idx_height] * scale_y);
-
-    // Create tensors
-    Tensor src = create_tensor<Tensor>(src_shape, data_type, 1, QuantizationInfo(), data_layout);
-    Tensor dst = create_tensor<Tensor>(shape_scaled, data_type, 1, QuantizationInfo(), data_layout);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEScale nescale;
-    nescale.configure(&src, &dst, policy, border_mode, constant_border_value, sampling_policy);
-
-    // Validate valid region
-    const ValidRegion dst_valid_region = calculate_valid_region_scale(*(src.info()), shape_scaled, policy, sampling_policy, (border_mode == BorderMode::UNDEFINED));
-    validate(dst.info()->valid_region(), dst_valid_region);
-
-    // Validate padding
-    int num_elements_processed_x = 16;
-    if(data_layout == DataLayout::NHWC)
-    {
-        num_elements_processed_x = (policy == InterpolationPolicy::BILINEAR) ? 1 : 16 / src.info()->element_size();
-    }
-    PaddingCalculator calculator(shape_scaled.x(), num_elements_processed_x);
-    calculator.set_border_mode(border_mode);
-
-    PaddingSize read_padding(1);
-    if(data_layout == DataLayout::NHWC)
-    {
-        read_padding = calculator.required_padding(PaddingCalculator::Option::EXCLUDE_BORDER);
-        if(border_mode != BorderMode::REPLICATE && policy == InterpolationPolicy::BILINEAR)
-        {
-            read_padding.top = 1;
-        }
-    }
-    const PaddingSize write_padding = calculator.required_padding(PaddingCalculator::Option::EXCLUDE_BORDER);
-    validate(src.info()->padding(), read_padding);
-    validate(dst.info()->padding(), write_padding);
 }
 
+TEST_CASE(MissmatchingDataType, framework::DatasetMode::ALL)
+{
+    constexpr auto non_default_data_type = DataType::F32;
+
+    const auto input  = TensorInfo{ input_shape, 1, default_data_type, default_data_layout };
+    const auto output = TensorInfo{ output_shape, 1, non_default_data_type, default_data_layout };
+    Status     result{};
+
+    result = NEScale::validate(&input, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode });
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(UsePadding, framework::DatasetMode::ALL)
+{
+    const auto input  = TensorInfo{ input_shape, 1, default_data_type, default_data_layout };
+    const auto output = TensorInfo{ output_shape, 1, default_data_type, default_data_layout };
+    Status     result{};
+
+    // When use padding is false, border mode should be constant
+    constexpr auto border_mode = BorderMode::UNDEFINED;
+    constexpr bool use_padding = false;
+
+    result = NEScale::validate(&input, &output, ScaleKernelInfo{ default_interpolation_policy, border_mode, PixelValue(), default_sampling_policy, use_padding });
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(AreaWithNHWC, framework::DatasetMode::ALL)
+{
+    // InterpolationPolicy::AREA is not supported for NHWC
+    constexpr auto interpolation_policy = InterpolationPolicy::AREA;
+    constexpr auto data_layout          = DataLayout::NHWC;
+
+    const auto input  = TensorInfo{ input_shape, 1, default_data_type, data_layout };
+    const auto output = TensorInfo{ output_shape, 1, default_data_type, data_layout };
+    Status     result{};
+
+    result = NEScale::validate(&input, &output, ScaleKernelInfo{ interpolation_policy, default_border_mode });
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(AreaWithNonU8, framework::DatasetMode::ALL)
+{
+    // InterpolationPolicy::AREA only supports U8
+    constexpr auto interpolation_policy = InterpolationPolicy::AREA;
+    constexpr auto data_type            = DataType::F32;
+    constexpr auto data_layout          = DataLayout::NCHW;
+
+    const auto input  = TensorInfo{ input_shape, 1, data_type, data_layout };
+    const auto output = TensorInfo{ output_shape, 1, data_type, data_layout };
+    Status     result{};
+
+    result = NEScale::validate(&input, &output, ScaleKernelInfo{ interpolation_policy, default_border_mode });
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(AlignedCornerNotSupported, framework::DatasetMode::ALL)
+{
+    // Aligned corners require sampling policy to be TOP_LEFT.
+    constexpr auto interpolation_policy = InterpolationPolicy::BILINEAR;
+    constexpr bool align_corners        = true;
+    constexpr auto sampling_policy      = SamplingPolicy::CENTER;
+
+    const auto input  = TensorInfo{ input_shape, 1, default_data_type, default_data_layout };
+    const auto output = TensorInfo{ output_shape, 1, default_data_type, default_data_layout };
+    Status     result{};
+
+    result = NEScale::validate(&input, &output, ScaleKernelInfo{ interpolation_policy, default_border_mode, PixelValue(), sampling_policy, default_use_padding, align_corners });
+    ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
+}
+TEST_SUITE_END() // Validate
+
 template <typename T>
 using NEScaleFixture = ScaleValidationFixture<Tensor, Accessor, NEScale, T>;
 template <typename T>
@@ -207,13 +253,8 @@
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                                     DataType::F32)),
-                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                     framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                             datasets::BorderModes()),
-                                                                                                     framework::dataset::make("SamplingPolicy", { SamplingPolicy::TOP_LEFT, SamplingPolicy::CENTER })),
-                                                                                             AlignCorners))
+const auto f32_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<float>())), framework::dataset::make("DataType", DataType::F32));
+FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<float>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(f32_shape, ScaleSamplingPolicySet))
 {
     //Create valid region
     TensorInfo  src_info(_shape, 1, _data_type);
@@ -222,13 +263,7 @@
     // Validate output
     validate(Accessor(_target), _reference, valid_region, tolerance_f32, tolerance_num_f32);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEScaleFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                                 DataType::F32)),
-                                                                                                                 framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                 framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                                 datasets::BorderModes()),
-                                                                                                         framework::dataset::make("SamplingPolicy", { SamplingPolicy::TOP_LEFT, SamplingPolicy::CENTER })),
-                                                                                                 AlignCorners))
+FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleFixture<float>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(f32_shape, ScaleAlignCornersSamplingPolicySet))
 {
     //Create valid region
     TensorInfo  src_info(_shape, 1, _data_type);
@@ -240,13 +275,8 @@
 TEST_SUITE_END() // FP32
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                                    DataType::F16)),
-                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                    framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                            datasets::BorderModes()),
-                                                                                                    framework::dataset::make("SamplingPolicy", { SamplingPolicy::TOP_LEFT, SamplingPolicy::CENTER })),
-                                                                                            AlignCorners))
+const auto f16_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<half>())), framework::dataset::make("DataType", DataType::F16));
+FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<half>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(f16_shape, ScaleSamplingPolicySet))
 {
     //Create valid region
     TensorInfo        src_info(_shape, 1, _data_type);
@@ -255,13 +285,7 @@
     // Validate output
     validate(Accessor(_target), _reference, valid_region, tolerance_f16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEScaleFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                                        DataType::F16)),
-                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                        framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                                datasets::BorderModes()),
-                                                                                                        framework::dataset::make("SamplingPolicy", { SamplingPolicy::TOP_LEFT, SamplingPolicy::CENTER })),
-                                                                                                AlignCorners))
+FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleFixture<half>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(f16_shape, ScaleAlignCornersSamplingPolicySet))
 {
     //Create valid region
     TensorInfo        src_info(_shape, 1, _data_type);
@@ -276,13 +300,8 @@
 
 TEST_SUITE(Integer)
 TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                                       DataType::U8)),
-                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                       framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                               datasets::BorderModes()),
-                                                                                                       framework::dataset::make("SamplingPolicy", { SamplingPolicy::TOP_LEFT, SamplingPolicy::CENTER })),
-                                                                                               AlignCorners))
+const auto u8_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<uint8_t>())), framework::dataset::make("DataType", DataType::U8));
+FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<uint8_t>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(u8_shape, ScaleSamplingPolicySet))
 {
     //Create valid region
     TensorInfo  src_info(_shape, 1, _data_type);
@@ -291,13 +310,7 @@
     // Validate output
     validate(Accessor(_target), _reference, valid_region, tolerance_u8);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEScaleFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                                   DataType::U8)),
-                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                   framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                                   datasets::BorderModes()),
-                                                                                                           framework::dataset::make("SamplingPolicy", { SamplingPolicy::TOP_LEFT, SamplingPolicy::CENTER })),
-                                                                                                   AlignCorners))
+FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleFixture<uint8_t>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(u8_shape, ScaleAlignCornersSamplingPolicySet))
 {
     //Create valid region
     TensorInfo  src_info(_shape, 1, _data_type);
@@ -308,13 +321,8 @@
 }
 TEST_SUITE_END() // U8
 TEST_SUITE(S16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<int16_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType",
-                                                                                                                       DataType::S16)),
-                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                       framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                               datasets::BorderModes()),
-                                                                                                       framework::dataset::make("SamplingPolicy", { SamplingPolicy::TOP_LEFT, SamplingPolicy::CENTER })),
-                                                                                               AlignCorners))
+const auto s16_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<int16_t>())), framework::dataset::make("DataType", DataType::S16));
+FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleFixture<int16_t>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(s16_shape, ScaleSamplingPolicySet))
 {
     //Create valid region
     TensorInfo  src_info(_shape, 1, _data_type);
@@ -323,13 +331,7 @@
     // Validate output
     validate(Accessor(_target), _reference, valid_region, tolerance_s16, tolerance_num_s16);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEScaleFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType",
-                                                                                                                   DataType::S16)),
-                                                                                                                   framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                   framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                                   datasets::BorderModes()),
-                                                                                                           framework::dataset::make("SamplingPolicy", { SamplingPolicy::TOP_LEFT, SamplingPolicy::CENTER })),
-                                                                                                   AlignCorners))
+FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleFixture<int16_t>, framework::DatasetMode::ALL, ASSEMBLE_DATASET(s16_shape, ScaleAlignCornersSamplingPolicySet))
 {
     //Create valid region
     TensorInfo  src_info(_shape, 1, _data_type);
@@ -343,14 +345,18 @@
 
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                                                        framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                                                                                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) })),
-                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                        framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                                        datasets::BorderModes()),
-                                                                                                                framework::dataset::make("SamplingPolicy", { SamplingPolicy::TOP_LEFT, SamplingPolicy::CENTER })),
-                                                                                                        AlignCorners))
+const auto qasymm8_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<uint8_t>())), framework::dataset::make("DataType", DataType::QASYMM8));
+FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, ASSEMBLE_QUANTIZED_DATASET(qasymm8_shape, ScaleSamplingPolicySet, QuantizationInfoSet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_u8);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, ASSEMBLE_QUANTIZED_DATASET(qasymm8_shape, ScaleAlignCornersSamplingPolicySet,
+                       QuantizationInfoSet))
 {
     //Create valid region
     TensorInfo  src_info(_shape, 1, _data_type);
@@ -360,23 +366,27 @@
     validate(Accessor(_target), _reference, valid_region, tolerance_u8);
 }
 TEST_SUITE_END() // QASYMM8
-
 TEST_SUITE(QASYMM8_SIGNED)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
-                                                                                                                       framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
-                                                                                                                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) })),
-                                                                                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                                                                                                                       framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
-                                                                                                                       datasets::BorderModes()),
-                                                                                                               framework::dataset::make("SamplingPolicy", { SamplingPolicy::TOP_LEFT, SamplingPolicy::CENTER })),
-                                                                                                       AlignCorners))
+const auto                          qasymm8_signed_shape = combine((SCALE_SHAPE_DATASET(num_elements_per_vector<int8_t>())), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED));
+constexpr AbsoluteTolerance<int8_t> tolerance_qasymm8_signed{ 1 };
+FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleQuantizedFixture<int8_t>, framework::DatasetMode::ALL, ASSEMBLE_QUANTIZED_DATASET(qasymm8_signed_shape, ScaleSamplingPolicySet, QuantizationInfoSet))
 {
     //Create valid region
     TensorInfo  src_info(_shape, 1, _data_type);
     ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
 
     // Validate output
-    validate(Accessor(_target), _reference, valid_region, tolerance_u8);
+    validate(Accessor(_target), _reference, valid_region, tolerance_qasymm8_signed);
+}
+FIXTURE_DATA_TEST_CASE(RunSmallAlignCorners, NEScaleQuantizedFixture<int8_t>, framework::DatasetMode::ALL, ASSEMBLE_QUANTIZED_DATASET(qasymm8_signed_shape, ScaleAlignCornersSamplingPolicySet,
+                       QuantizationInfoSet))
+{
+    //Create valid region
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Validate output
+    validate(Accessor(_target), _reference, valid_region, tolerance_qasymm8_signed);
 }
 TEST_SUITE_END() // QASYMM8_SIGNED
 TEST_SUITE_END() // Quantized
diff --git a/tests/validation/NEON/Schaar.cpp b/tests/validation/NEON/Schaar.cpp
index 0b96eee..85a85cc 100644
--- a/tests/validation/NEON/Schaar.cpp
+++ b/tests/validation/NEON/Schaar.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Select.cpp b/tests/validation/NEON/Select.cpp
index 9ac7a6f..4fe422b 100644
--- a/tests/validation/NEON/Select.cpp
+++ b/tests/validation/NEON/Select.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Slice.cpp b/tests/validation/NEON/Slice.cpp
index 69506a5..1b35bfa 100644
--- a/tests/validation/NEON/Slice.cpp
+++ b/tests/validation/NEON/Slice.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Sobel.cpp b/tests/validation/NEON/Sobel.cpp
index c9ff7d6..2765057 100644
--- a/tests/validation/NEON/Sobel.cpp
+++ b/tests/validation/NEON/Sobel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/SoftmaxLayer.cpp b/tests/validation/NEON/SoftmaxLayer.cpp
index 8af3847..70203d9 100644
--- a/tests/validation/NEON/SoftmaxLayer.cpp
+++ b/tests/validation/NEON/SoftmaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -73,7 +73,9 @@
                                                        TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
                                                        TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
                                                                   QuantizationInfo(1.f/256, 12)),
-                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,  //Invalid axis value
+                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,  //Invalid axis high
+                                                                  QuantizationInfo(1.f/256, 12)),
+                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,  //Invalid axis low
                                                                   QuantizationInfo(1.f/256, 12)),
                                                       }),
                framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U), 1, DataType::F16),
@@ -85,6 +87,8 @@
                                                                   QuantizationInfo(1.f/256, 0)),
                                                        TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
                                                                   QuantizationInfo(1.f/256, 0)),
+                                                       TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
+                                                                  QuantizationInfo(1.f/256, 0)),
                                                      })),
                framework::dataset::make("beta", { 1.0,
                                                   2.0,
@@ -94,14 +98,15 @@
                                                   2.0,
                                                   1.0,
                                                 })),
-               framework::dataset::make("axis", { 1,
-                                                  1,
-                                                  1,
-                                                  -1,
-                                                  1,
+               framework::dataset::make("axis", { 0,
+                                                  0,
+                                                  0,
+                                                  0,
+                                                  0,
+                                                  2,
                                                   -3,
                                                 })),
-               framework::dataset::make("Expected", { false, false, false, true, true, false })),
+               framework::dataset::make("Expected", { false, false, false, true, true, false, false })),
                input_info, output_info, beta, axis, expected)
 {
     ARM_COMPUTE_EXPECT(bool(NESoftmaxLayer::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), beta, axis)) == expected, framework::LogLevel::ERRORS);
@@ -118,7 +123,7 @@
 FIXTURE_DATA_TEST_CASE(RunSmall, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
                                                                                                                  framework::dataset::make("DataType", DataType::F16)),
                                                                                                                  framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                         framework::dataset::make("Axis", { 1 })))
+                                                                                                         framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
@@ -126,7 +131,7 @@
 FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
                                                                                                                    framework::dataset::make("DataType", DataType::F16)),
                                                                                                                    framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                           framework::dataset::make("Axis", { 1, 2, 3 })))
+                                                                                                           framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
@@ -134,7 +139,7 @@
 FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
                                                                                                                        framework::dataset::make("DataType", DataType::F16)),
                                                                                                                framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                       framework::dataset::make("Axis", { 1 })))
+                                                                                                       framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f16);
@@ -146,7 +151,7 @@
 FIXTURE_DATA_TEST_CASE(RunSmall2D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SoftmaxLayerSmallShapes(),
                                                                                                                     framework::dataset::make("DataType", DataType::F32)),
                                                                                                                     framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                            framework::dataset::make("Axis", { 1 })))
+                                                                                                            framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
@@ -154,7 +159,7 @@
 FIXTURE_DATA_TEST_CASE(RunSmall4D, NESoftmaxLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small4DShapes(),
                                                                                                                     framework::dataset::make("DataType", DataType::F32)),
                                                                                                                     framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                            framework::dataset::make("Axis", { 1, 2, 3 })))
+                                                                                                            framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
@@ -162,7 +167,7 @@
 FIXTURE_DATA_TEST_CASE(RunLarge, NESoftmaxLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::SoftmaxLayerLargeShapes(),
                                                                                                                         framework::dataset::make("DataType", DataType::F32)),
                                                                                                                 framework::dataset::make("Beta", { 1.0f, 2.0f })),
-                                                                                                        framework::dataset::make("Axis", { 1 })))
+                                                                                                        framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
@@ -179,7 +184,7 @@
                                                                                                                  framework::dataset::make("DataType", DataType::QASYMM8)),
                                                                                                                  combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
                                                                                                                          framework::dataset::make("Beta", { 1.0f, 2.f }))),
-                                                                                                                 framework::dataset::make("Axis", { 1 })))
+                                                                                                                 framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -188,7 +193,7 @@
                                                                                                                  framework::dataset::make("DataType", DataType::QASYMM8)),
                                                                                                                  combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
                                                                                                                          framework::dataset::make("Beta", { 1.0f, 2.f }))),
-                                                                                                                 framework::dataset::make("Axis", { -1, 2, 3 })))
+                                                                                                                 framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -197,7 +202,7 @@
                                                                                                                    framework::dataset::make("DataType", DataType::QASYMM8)),
                                                                                                                    combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
                                                                                                                            framework::dataset::make("Beta", { 1.0f, 2.0f }))),
-                                                                                                                   framework::dataset::make("Axis", { 1 })))
+                                                                                                                   framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8);
@@ -209,7 +214,7 @@
                                                                                                                         framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
                                                                                                                         combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
                                                                                                                                 framework::dataset::make("Beta", { 1.0f, 2.f }))),
-                                                                                                                framework::dataset::make("Axis", { -1, 1 })))
+                                                                                                                framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
@@ -218,7 +223,7 @@
                                                                                                                         framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
                                                                                                                         combine(framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }),
                                                                                                                                 framework::dataset::make("Beta", { 1.0f, 2.f }))),
-                                                                                                                framework::dataset::make("Axis", { -2, 2, 3 })))
+                                                                                                                framework::dataset::make("Axis", { 0 })))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_qasymm8_signed);
diff --git a/tests/validation/NEON/SpaceToBatchLayer.cpp b/tests/validation/NEON/SpaceToBatchLayer.cpp
index fc8a800..d9fea97 100644
--- a/tests/validation/NEON/SpaceToBatchLayer.cpp
+++ b/tests/validation/NEON/SpaceToBatchLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/SpaceToDepthLayer.cpp b/tests/validation/NEON/SpaceToDepthLayer.cpp
index 8d27261..2cdfb06 100644
--- a/tests/validation/NEON/SpaceToDepthLayer.cpp
+++ b/tests/validation/NEON/SpaceToDepthLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Split.cpp b/tests/validation/NEON/Split.cpp
index 5ad19a6..a80f9ac 100644
--- a/tests/validation/NEON/Split.cpp
+++ b/tests/validation/NEON/Split.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/StackLayer.cpp b/tests/validation/NEON/StackLayer.cpp
index fbe4537..9ba709a 100644
--- a/tests/validation/NEON/StackLayer.cpp
+++ b/tests/validation/NEON/StackLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/StridedSlice.cpp b/tests/validation/NEON/StridedSlice.cpp
index f94aed0..91d5a64 100644
--- a/tests/validation/NEON/StridedSlice.cpp
+++ b/tests/validation/NEON/StridedSlice.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/TableLookup.cpp b/tests/validation/NEON/TableLookup.cpp
index 0eae2ea..647c486 100644
--- a/tests/validation/NEON/TableLookup.cpp
+++ b/tests/validation/NEON/TableLookup.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Threshold.cpp b/tests/validation/NEON/Threshold.cpp
index 7cddf7c..917a8a2 100644
--- a/tests/validation/NEON/Threshold.cpp
+++ b/tests/validation/NEON/Threshold.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -53,16 +53,15 @@
 
     // Create and configure function
     NEThreshold thrsh;
-    thrsh.configure(&src, &dst, threshold, false_value, true_value, type, upper);
+    thrsh.configure(&src, &dst, ThresholdKernelInfo(threshold, false_value, true_value, type, upper));
 
     // Validate valid region
     const ValidRegion valid_region = shape_to_valid_region(shape);
     validate(dst.info()->valid_region(), valid_region);
 
     // Validate padding
-    const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
-    validate(src.info()->padding(), padding);
-    validate(dst.info()->padding(), padding);
+    validate(src.info()->padding(), PaddingSize());
+    validate(dst.info()->padding(), PaddingSize());
 }
 
 template <typename T>
diff --git a/tests/validation/NEON/Tile.cpp b/tests/validation/NEON/Tile.cpp
index d033f59..aa96c95 100644
--- a/tests/validation/NEON/Tile.cpp
+++ b/tests/validation/NEON/Tile.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Transpose.cpp b/tests/validation/NEON/Transpose.cpp
index 1f38fcc..5c77128 100644
--- a/tests/validation/NEON/Transpose.cpp
+++ b/tests/validation/NEON/Transpose.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/UNIT/DynamicTensor.cpp b/tests/validation/NEON/UNIT/DynamicTensor.cpp
index 55ade8c..464633e 100644
--- a/tests/validation/NEON/UNIT/DynamicTensor.cpp
+++ b/tests/validation/NEON/UNIT/DynamicTensor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/UNIT/MemoryManager.cpp b/tests/validation/NEON/UNIT/MemoryManager.cpp
index 8defc7a..83a9fcb 100644
--- a/tests/validation/NEON/UNIT/MemoryManager.cpp
+++ b/tests/validation/NEON/UNIT/MemoryManager.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/UNIT/RuntimeContext.cpp b/tests/validation/NEON/UNIT/RuntimeContext.cpp
index 0823085..6070a88 100644
--- a/tests/validation/NEON/UNIT/RuntimeContext.cpp
+++ b/tests/validation/NEON/UNIT/RuntimeContext.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/UNIT/TensorAllocator.cpp b/tests/validation/NEON/UNIT/TensorAllocator.cpp
index 21e4e71..273d2e0 100644
--- a/tests/validation/NEON/UNIT/TensorAllocator.cpp
+++ b/tests/validation/NEON/UNIT/TensorAllocator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Unstack.cpp b/tests/validation/NEON/Unstack.cpp
index 6be80df..aa8dcc5 100644
--- a/tests/validation/NEON/Unstack.cpp
+++ b/tests/validation/NEON/Unstack.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/Upsample.cpp b/tests/validation/NEON/Upsample.cpp
index 50aadef..221f690 100644
--- a/tests/validation/NEON/Upsample.cpp
+++ b/tests/validation/NEON/Upsample.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/WarpAffine.cpp b/tests/validation/NEON/WarpAffine.cpp
index 4339d02..ce5360b 100644
--- a/tests/validation/NEON/WarpAffine.cpp
+++ b/tests/validation/NEON/WarpAffine.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/WarpPerspective.cpp b/tests/validation/NEON/WarpPerspective.cpp
index 2f21913..d146bda 100644
--- a/tests/validation/NEON/WarpPerspective.cpp
+++ b/tests/validation/NEON/WarpPerspective.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/NEON/WidthConcatenateLayer.cpp b/tests/validation/NEON/WidthConcatenateLayer.cpp
index 3edf136..123a772 100644
--- a/tests/validation/NEON/WidthConcatenateLayer.cpp
+++ b/tests/validation/NEON/WidthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -69,7 +69,7 @@
     inputs_vector_info.emplace_back(std::move(input_info1));
     inputs_vector_info.emplace_back(std::move(input_info2));
 
-    std::vector<ITensorInfo *> inputs_vector_info_raw;
+    std::vector<const ITensorInfo *> inputs_vector_info_raw;
     inputs_vector_info_raw.reserve(inputs_vector_info.size());
     for(auto &input : inputs_vector_info)
     {
diff --git a/tests/validation/NEON/YOLOLayer.cpp b/tests/validation/NEON/YOLOLayer.cpp
index 0edbc4e..4806122 100644
--- a/tests/validation/NEON/YOLOLayer.cpp
+++ b/tests/validation/NEON/YOLOLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/UNIT/GPUTarget.cpp b/tests/validation/UNIT/GPUTarget.cpp
index c9d7866..5d8c63b 100644
--- a/tests/validation/UNIT/GPUTarget.cpp
+++ b/tests/validation/UNIT/GPUTarget.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/UNIT/LifetimeManager.cpp b/tests/validation/UNIT/LifetimeManager.cpp
index 44a52aa..5345cba 100644
--- a/tests/validation/UNIT/LifetimeManager.cpp
+++ b/tests/validation/UNIT/LifetimeManager.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/UNIT/SafeIntegerOps.cpp b/tests/validation/UNIT/SafeIntegerOps.cpp
index e721c48..62f7041 100644
--- a/tests/validation/UNIT/SafeIntegerOps.cpp
+++ b/tests/validation/UNIT/SafeIntegerOps.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/UNIT/SubTensorInfo.cpp b/tests/validation/UNIT/SubTensorInfo.cpp
new file mode 100644
index 0000000..5a93062
--- /dev/null
+++ b/tests/validation/UNIT/SubTensorInfo.cpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/SubTensorInfo.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/Validation.h"
+#include "utils/TypePrinter.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(UNIT)
+TEST_SUITE(SubTensorInfo)
+
+/** Validate sub-tensor creation
+ *
+ * Tests performed:
+ *
+ *  - Negative testing on X indexing
+ *  - Negative testing on Y indexing
+ *  - Positive testing on X,Y indexing
+ */
+TEST_CASE(SubTensorCreation, framework::DatasetMode::ALL)
+{
+    // Create tensor info
+    TensorInfo info(TensorShape(23U, 17U, 3U), 1, DataType::F32);
+
+    // Negative testing on X
+    ARM_COMPUTE_EXPECT_THROW(SubTensorInfo(&info, TensorShape(13U, 17U, 3U), Coordinates(24, 0, 0)), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT_THROW(SubTensorInfo(&info, TensorShape(13U, 17U, 3U), Coordinates(15, 0, 0)), framework::LogLevel::ERRORS);
+
+    // Negative testing on Y
+    ARM_COMPUTE_EXPECT_THROW(SubTensorInfo(&info, TensorShape(23U, 8U, 3U), Coordinates(0, 18, 0)), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT_THROW(SubTensorInfo(&info, TensorShape(23U, 8U, 3U), Coordinates(0, 13, 0)), framework::LogLevel::ERRORS);
+
+    // Positive testing on XY indexing
+    ARM_COMPUTE_EXPECT_NO_THROW(SubTensorInfo(&info, TensorShape(4U, 3U, 2U), Coordinates(5, 2, 1)), framework::LogLevel::ERRORS);
+}
+
+/** Validate when extending padding on sub-tensor
+ *
+ * Tests performed:
+ *  - A) Extend padding when SubTensor XY does not match parent tensor should fail
+ *    B) Extend with zero padding when SubTensor XY does not match parent tensor should succeed
+ *  - C) Extend padding when SubTensor XY matches parent tensor should succeed
+ */
+TEST_CASE(SubTensorPaddingExpansion, framework::DatasetMode::ALL)
+{
+    // Test A
+    {
+        TensorInfo    tensor_info(TensorShape(23U, 17U, 3U), 1, DataType::F32);
+        SubTensorInfo sub_tensor_info(&tensor_info, TensorShape(4U, 3U, 2U), Coordinates(5, 2, 1));
+        ARM_COMPUTE_EXPECT_THROW(sub_tensor_info.extend_padding(PaddingSize(2, 1)), framework::LogLevel::ERRORS);
+    }
+
+    // Test B
+    {
+        TensorInfo    tensor_info(TensorShape(23U, 17U, 3U), 1, DataType::F32);
+        SubTensorInfo sub_tensor_info(&tensor_info, TensorShape(4U, 3U, 1U), Coordinates(5, 2, 1));
+        ARM_COMPUTE_EXPECT_NO_THROW(sub_tensor_info.extend_padding(PaddingSize(0, 0)), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(tensor_info.padding().uniform(), framework::LogLevel::ERRORS);
+    }
+
+    // Test C
+    {
+        TensorInfo    tensor_info(TensorShape(23U, 17U, 3U), 1, DataType::F32);
+        SubTensorInfo sub_tensor_info(&tensor_info, TensorShape(23U, 17U, 1U), Coordinates(0, 0, 1));
+        ARM_COMPUTE_EXPECT_NO_THROW(sub_tensor_info.extend_padding(PaddingSize(2, 1)), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(tensor_info.padding().top == 2, framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(tensor_info.padding().right == 1, framework::LogLevel::ERRORS);
+    }
+}
+
+TEST_SUITE_END() // SubTensorInfo
+TEST_SUITE_END()
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/UNIT/TensorInfo.cpp b/tests/validation/UNIT/TensorInfo.cpp
index 009c757..b5928cc 100644
--- a/tests/validation/UNIT/TensorInfo.cpp
+++ b/tests/validation/UNIT/TensorInfo.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/UNIT/TensorShape.cpp b/tests/validation/UNIT/TensorShape.cpp
index 31f95e3..ebe9b32 100644
--- a/tests/validation/UNIT/TensorShape.cpp
+++ b/tests/validation/UNIT/TensorShape.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/UNIT/Utils.cpp b/tests/validation/UNIT/Utils.cpp
index 3980674..9309278 100644
--- a/tests/validation/UNIT/Utils.cpp
+++ b/tests/validation/UNIT/Utils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/UNIT/WindowIterator.cpp b/tests/validation/UNIT/WindowIterator.cpp
index 402cab4..4430299 100644
--- a/tests/validation/UNIT/WindowIterator.cpp
+++ b/tests/validation/UNIT/WindowIterator.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/Validation.cpp b/tests/validation/Validation.cpp
index 89bbb2e..7f819c7 100644
--- a/tests/validation/Validation.cpp
+++ b/tests/validation/Validation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/Validation.h b/tests/validation/Validation.h
index fa33869..2d8d5b3 100644
--- a/tests/validation/Validation.h
+++ b/tests/validation/Validation.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/AbsoluteDifferenceFixture.h b/tests/validation/fixtures/AbsoluteDifferenceFixture.h
index b725304..46118c9 100644
--- a/tests/validation/fixtures/AbsoluteDifferenceFixture.h
+++ b/tests/validation/fixtures/AbsoluteDifferenceFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/AccumulateFixture.h b/tests/validation/fixtures/AccumulateFixture.h
index 0910001..8fa6689 100644
--- a/tests/validation/fixtures/AccumulateFixture.h
+++ b/tests/validation/fixtures/AccumulateFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/ActivationLayerFixture.h b/tests/validation/fixtures/ActivationLayerFixture.h
index 551ee2d..91b43f0 100644
--- a/tests/validation/fixtures/ActivationLayerFixture.h
+++ b/tests/validation/fixtures/ActivationLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/ArgMinMaxFixture.h b/tests/validation/fixtures/ArgMinMaxFixture.h
index 2932ba4..cf34bcc 100644
--- a/tests/validation/fixtures/ArgMinMaxFixture.h
+++ b/tests/validation/fixtures/ArgMinMaxFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/ArithmeticDivisionFixture.h b/tests/validation/fixtures/ArithmeticDivisionFixture.h
index cf9db95..713a6db 100644
--- a/tests/validation/fixtures/ArithmeticDivisionFixture.h
+++ b/tests/validation/fixtures/ArithmeticDivisionFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/ArithmeticOperationsFixture.h b/tests/validation/fixtures/ArithmeticOperationsFixture.h
index 4a6b0bd..9ba7bd3 100644
--- a/tests/validation/fixtures/ArithmeticOperationsFixture.h
+++ b/tests/validation/fixtures/ArithmeticOperationsFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -48,10 +48,11 @@
     template <typename...>
     void setup(reference::ArithmeticOperation op, const TensorShape &shape0, const TensorShape &shape1,
                DataType data_type0, DataType data_type1, DataType output_data_type, ConvertPolicy convert_policy,
-               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out, ActivationLayerInfo act_info)
+               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out, ActivationLayerInfo act_info, bool in_place)
     {
         _op        = op;
         _act_info  = act_info;
+        _in_place  = in_place;
         _target    = compute_target(shape0, shape1, data_type0, data_type1, output_data_type, convert_policy, qinfo0, qinfo1, qinfo_out);
         _reference = compute_reference(shape0, shape1, data_type0, data_type1, output_data_type, convert_policy, qinfo0, qinfo1, qinfo_out);
     }
@@ -67,26 +68,27 @@
                               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out)
     {
         // Create tensors
-        TensorType ref_src1 = create_tensor<TensorType>(shape0, data_type0, 1, qinfo0);
-        TensorType ref_src2 = create_tensor<TensorType>(shape1, data_type1, 1, qinfo1);
-        TensorType dst      = create_tensor<TensorType>(TensorShape::broadcast_shape(shape0, shape1), output_data_type, 1, qinfo_out);
+        TensorType  ref_src1   = create_tensor<TensorType>(shape0, data_type0, 1, qinfo0);
+        TensorType  ref_src2   = create_tensor<TensorType>(shape1, data_type1, 1, qinfo1);
+        TensorType  dst        = create_tensor<TensorType>(TensorShape::broadcast_shape(shape0, shape1), output_data_type, 1, qinfo_out);
+        TensorType *dst_to_use = _in_place ? &ref_src1 : &dst;
 
         // Create and configure function
         FunctionType arith_op;
-        arith_op.configure(&ref_src1, &ref_src2, &dst, convert_policy, _act_info);
+        arith_op.configure(&ref_src1, &ref_src2, dst_to_use, convert_policy, _act_info);
 
         ARM_COMPUTE_EXPECT(ref_src1.info()->is_resizable(), framework::LogLevel::ERRORS);
         ARM_COMPUTE_EXPECT(ref_src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(dst_to_use->info()->is_resizable(), framework::LogLevel::ERRORS);
 
         // Allocate tensors
         ref_src1.allocator()->allocate();
         ref_src2.allocator()->allocate();
-        dst.allocator()->allocate();
+        dst_to_use->allocator()->allocate();
 
         ARM_COMPUTE_EXPECT(!ref_src1.info()->is_resizable(), framework::LogLevel::ERRORS);
         ARM_COMPUTE_EXPECT(!ref_src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!dst_to_use->info()->is_resizable(), framework::LogLevel::ERRORS);
 
         // Fill tensors
         fill(AccessorType(ref_src1), 0);
@@ -95,6 +97,10 @@
         // Compute function
         arith_op.run();
 
+        if(_in_place)
+        {
+            return ref_src1;
+        }
         return dst;
     }
 
@@ -102,23 +108,28 @@
                                       DataType data_type0, DataType data_type1, DataType output_data_type, ConvertPolicy convert_policy,
                                       QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out)
     {
+        // The current in-place implementation only supports input and output tensors with identical metadata.
+        // By ignoring the output quantization information here, we keep the test case implementation much simpler.
+        QuantizationInfo output_qinfo = _in_place ? qinfo0 : qinfo_out;
+
         // Create reference
         SimpleTensor<T> ref_src1{ shape0, data_type0, 1, qinfo0 };
         SimpleTensor<T> ref_src2{ shape1, data_type1, 1, qinfo1 };
-        SimpleTensor<T> ref_dst{ TensorShape::broadcast_shape(shape0, shape1), output_data_type, 1, qinfo_out };
+        SimpleTensor<T> ref_dst{ TensorShape::broadcast_shape(shape0, shape1), output_data_type, 1, output_qinfo };
 
         // Fill reference
         fill(ref_src1, 0);
         fill(ref_src2, 1);
 
         auto result = reference::arithmetic_operation<T>(_op, ref_src1, ref_src2, ref_dst, convert_policy);
-        return _act_info.enabled() ? reference::activation_layer(result, _act_info, qinfo_out) : result;
+        return _act_info.enabled() ? reference::activation_layer(result, _act_info, output_qinfo) : result;
     }
 
     TensorType                     _target{};
     SimpleTensor<T>                _reference{};
     reference::ArithmeticOperation _op{ reference::ArithmeticOperation::ADD };
     ActivationLayerInfo            _act_info{};
+    bool                           _in_place{};
 };
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
@@ -129,7 +140,7 @@
     void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type, ConvertPolicy convert_policy)
     {
         ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::ADD, shape0, shape1, data_type0, data_type1,
-                                                                                            output_data_type, convert_policy, QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), ActivationLayerInfo());
+                                                                                            output_data_type, convert_policy, QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), ActivationLayerInfo(), false);
     }
 };
 
@@ -141,7 +152,7 @@
     void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, ConvertPolicy convert_policy)
     {
         ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::ADD, shape, shape, data_type0, data_type1,
-                                                                                            output_data_type, convert_policy, QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), ActivationLayerInfo());
+                                                                                            output_data_type, convert_policy, QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), ActivationLayerInfo(), false);
     }
 };
 
@@ -153,7 +164,7 @@
     void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type, ConvertPolicy convert_policy, ActivationLayerInfo act_info)
     {
         ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::ADD, shape0, shape1, data_type0, data_type1,
-                                                                                            output_data_type, convert_policy, QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info);
+                                                                                            output_data_type, convert_policy, QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info, false);
     }
 };
 
@@ -165,7 +176,7 @@
     void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, ConvertPolicy convert_policy, ActivationLayerInfo act_info)
     {
         ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::ADD, shape, shape, data_type0, data_type1,
-                                                                                            output_data_type, convert_policy, QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info);
+                                                                                            output_data_type, convert_policy, QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info, false);
     }
 };
 
@@ -179,7 +190,21 @@
 
     {
         ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::ADD, shape, shape, data_type0, data_type1,
-                                                                                            output_data_type, convert_policy, qinfo0, qinfo1, qinfo_out, ActivationLayerInfo());
+                                                                                            output_data_type, convert_policy, qinfo0, qinfo1, qinfo_out, ActivationLayerInfo(), false);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class ArithmeticAdditionValidationQuantizedBroadcastFixture : public ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    template <typename...>
+    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type,
+               ConvertPolicy convert_policy, QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out)
+    {
+        ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::ADD, shape0, shape1,
+                                                                                            data_type0, data_type1, output_data_type, convert_policy,
+                                                                                            qinfo0, qinfo1, qinfo_out, ActivationLayerInfo(), false);
     }
 };
 
@@ -188,11 +213,11 @@
 {
 public:
     template <typename...>
-    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type, ConvertPolicy convert_policy)
+    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type, ConvertPolicy convert_policy, bool in_place)
     {
         ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::SUB, shape0, shape1,
                                                                                             data_type0, data_type1, output_data_type, convert_policy,
-                                                                                            QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), ActivationLayerInfo());
+                                                                                            QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), ActivationLayerInfo(), in_place);
     }
 };
 
@@ -201,11 +226,12 @@
 {
 public:
     template <typename...>
-    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type, ConvertPolicy convert_policy, ActivationLayerInfo act_info)
+    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type, ConvertPolicy convert_policy, ActivationLayerInfo act_info,
+               bool in_place)
     {
         ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::SUB, shape0, shape1,
                                                                                             data_type0, data_type1, output_data_type, convert_policy,
-                                                                                            QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info);
+                                                                                            QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info, in_place);
     }
 };
 
@@ -214,11 +240,11 @@
 {
 public:
     template <typename...>
-    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, ConvertPolicy convert_policy)
+    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, ConvertPolicy convert_policy, bool in_place)
     {
         ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::SUB, shape, shape,
                                                                                             data_type0, data_type1, output_data_type, convert_policy,
-                                                                                            QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), ActivationLayerInfo());
+                                                                                            QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), ActivationLayerInfo(), in_place);
     }
 };
 
@@ -227,11 +253,11 @@
 {
 public:
     template <typename...>
-    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, ConvertPolicy convert_policy, ActivationLayerInfo act_info)
+    void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, ConvertPolicy convert_policy, ActivationLayerInfo act_info, bool in_place)
     {
         ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::SUB, shape, shape,
                                                                                             data_type0, data_type1, output_data_type, convert_policy,
-                                                                                            QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info);
+                                                                                            QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info, in_place);
     }
 };
 
@@ -241,12 +267,26 @@
 public:
     template <typename...>
     void setup(const TensorShape &shape, DataType data_type0, DataType data_type1, DataType output_data_type, ConvertPolicy convert_policy,
-               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out)
+               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out, bool in_place)
 
     {
         ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::SUB, shape, shape,
                                                                                             data_type0, data_type1, output_data_type,
-                                                                                            convert_policy, qinfo0, qinfo1, qinfo_out, ActivationLayerInfo());
+                                                                                            convert_policy, qinfo0, qinfo1, qinfo_out, ActivationLayerInfo(), in_place);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class ArithmeticSubtractionValidationQuantizedBroadcastFixture : public ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    template <typename...>
+    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type,
+               ConvertPolicy convert_policy, QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out, bool in_place)
+    {
+        ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::SUB, shape0, shape1,
+                                                                                            data_type0, data_type1, output_data_type, convert_policy,
+                                                                                            qinfo0, qinfo1, qinfo_out, ActivationLayerInfo(), in_place);
     }
 };
 } // namespace validation
diff --git a/tests/validation/fixtures/BatchNormalizationLayerFixture.h b/tests/validation/fixtures/BatchNormalizationLayerFixture.h
index 359752f..8a6caac 100644
--- a/tests/validation/fixtures/BatchNormalizationLayerFixture.h
+++ b/tests/validation/fixtures/BatchNormalizationLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/BatchNormalizationLayerFusionFixture.h b/tests/validation/fixtures/BatchNormalizationLayerFusionFixture.h
index 39c7d46..2df7f47 100644
--- a/tests/validation/fixtures/BatchNormalizationLayerFusionFixture.h
+++ b/tests/validation/fixtures/BatchNormalizationLayerFusionFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/BatchToSpaceLayerFixture.h b/tests/validation/fixtures/BatchToSpaceLayerFixture.h
index 973f2ed..ca6d20a 100644
--- a/tests/validation/fixtures/BatchToSpaceLayerFixture.h
+++ b/tests/validation/fixtures/BatchToSpaceLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/BitwiseAndFixture.h b/tests/validation/fixtures/BitwiseAndFixture.h
index c1247de..6c8e1b1 100644
--- a/tests/validation/fixtures/BitwiseAndFixture.h
+++ b/tests/validation/fixtures/BitwiseAndFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/BitwiseNotFixture.h b/tests/validation/fixtures/BitwiseNotFixture.h
index f90d089..c6affcf 100644
--- a/tests/validation/fixtures/BitwiseNotFixture.h
+++ b/tests/validation/fixtures/BitwiseNotFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/BitwiseOrFixture.h b/tests/validation/fixtures/BitwiseOrFixture.h
index d6ec5b8..a40f635 100644
--- a/tests/validation/fixtures/BitwiseOrFixture.h
+++ b/tests/validation/fixtures/BitwiseOrFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/BitwiseXorFixture.h b/tests/validation/fixtures/BitwiseXorFixture.h
index 8da2181..c103033 100644
--- a/tests/validation/fixtures/BitwiseXorFixture.h
+++ b/tests/validation/fixtures/BitwiseXorFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/BoundingBoxTransformFixture.h b/tests/validation/fixtures/BoundingBoxTransformFixture.h
index 5e4c598..7155848 100644
--- a/tests/validation/fixtures/BoundingBoxTransformFixture.h
+++ b/tests/validation/fixtures/BoundingBoxTransformFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/Box3x3Fixture.h b/tests/validation/fixtures/Box3x3Fixture.h
index e851b34..8caeec8 100644
--- a/tests/validation/fixtures/Box3x3Fixture.h
+++ b/tests/validation/fixtures/Box3x3Fixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/CannyEdgeFixture.h b/tests/validation/fixtures/CannyEdgeFixture.h
index d52b17e..8e82e6d 100644
--- a/tests/validation/fixtures/CannyEdgeFixture.h
+++ b/tests/validation/fixtures/CannyEdgeFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/CastFixture.h b/tests/validation/fixtures/CastFixture.h
index 3a6efa2..c9764af 100644
--- a/tests/validation/fixtures/CastFixture.h
+++ b/tests/validation/fixtures/CastFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -59,6 +59,7 @@
             {
                 case DataType::U8:
                 case DataType::QASYMM8:
+                case DataType::QASYMM8_SIGNED:
                 case DataType::S8:
                 case DataType::F32:
                 {
diff --git a/tests/validation/fixtures/ChannelCombineFixture.h b/tests/validation/fixtures/ChannelCombineFixture.h
index 68d0237..f0d927a 100644
--- a/tests/validation/fixtures/ChannelCombineFixture.h
+++ b/tests/validation/fixtures/ChannelCombineFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/ChannelExtractFixture.h b/tests/validation/fixtures/ChannelExtractFixture.h
index c3c2e17..2f5694f 100644
--- a/tests/validation/fixtures/ChannelExtractFixture.h
+++ b/tests/validation/fixtures/ChannelExtractFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/ChannelShuffleLayerFixture.h b/tests/validation/fixtures/ChannelShuffleLayerFixture.h
index c9aae2d..de718fb 100644
--- a/tests/validation/fixtures/ChannelShuffleLayerFixture.h
+++ b/tests/validation/fixtures/ChannelShuffleLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/Col2ImFixture.h b/tests/validation/fixtures/Col2ImFixture.h
index 5488f8a..f8673af 100644
--- a/tests/validation/fixtures/Col2ImFixture.h
+++ b/tests/validation/fixtures/Col2ImFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/ColorConvertFixture.h b/tests/validation/fixtures/ColorConvertFixture.h
index cbaff6d..a5ed554 100644
--- a/tests/validation/fixtures/ColorConvertFixture.h
+++ b/tests/validation/fixtures/ColorConvertFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/ComparisonFixture.h b/tests/validation/fixtures/ComparisonFixture.h
index d1e1a53..43da0ae 100644
--- a/tests/validation/fixtures/ComparisonFixture.h
+++ b/tests/validation/fixtures/ComparisonFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/ComputeAllAnchorsFixture.h b/tests/validation/fixtures/ComputeAllAnchorsFixture.h
index e837bd4..f385cb8 100644
--- a/tests/validation/fixtures/ComputeAllAnchorsFixture.h
+++ b/tests/validation/fixtures/ComputeAllAnchorsFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/ConcatenateLayerFixture.h b/tests/validation/fixtures/ConcatenateLayerFixture.h
index d1eed63..e85f81c 100644
--- a/tests/validation/fixtures/ConcatenateLayerFixture.h
+++ b/tests/validation/fixtures/ConcatenateLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,9 +43,12 @@
 {
 namespace validation
 {
-template <typename TensorType, typename ITensorType, typename AccessorType, typename FunctionType, typename T>
+template <typename TensorType, typename ITensorType, typename AccessorType, typename FunctionType, typename T, bool CI = true>
 class ConcatenateLayerValidationFixture : public framework::Fixture
 {
+private:
+    using SrcITensorType = typename std::conditional<CI, const ITensorType, ITensorType>::type;
+
 public:
     template <typename...>
     void setup(TensorShape shape, DataType data_type, unsigned int axis)
@@ -95,8 +98,8 @@
 
     TensorType compute_target(const std::vector<TensorShape> &shapes, const std::vector<QuantizationInfo> &qinfo, DataType data_type, unsigned int axis)
     {
-        std::vector<TensorType>    srcs;
-        std::vector<ITensorType *> src_ptrs;
+        std::vector<TensorType>       srcs;
+        std::vector<SrcITensorType *> src_ptrs;
 
         // Create tensors
         srcs.reserve(shapes.size());
diff --git a/tests/validation/fixtures/ConvertFullyConnectedWeightsFixture.h b/tests/validation/fixtures/ConvertFullyConnectedWeightsFixture.h
index 0fcef5c..c90409f 100644
--- a/tests/validation/fixtures/ConvertFullyConnectedWeightsFixture.h
+++ b/tests/validation/fixtures/ConvertFullyConnectedWeightsFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/ConvolutionFixture.h b/tests/validation/fixtures/ConvolutionFixture.h
index 04172a6..4692e2f 100644
--- a/tests/validation/fixtures/ConvolutionFixture.h
+++ b/tests/validation/fixtures/ConvolutionFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/ConvolutionLayerFixture.h b/tests/validation/fixtures/ConvolutionLayerFixture.h
index b4abebe..ec13e1d 100644
--- a/tests/validation/fixtures/ConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/ConvolutionLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/CopyFixture.h b/tests/validation/fixtures/CopyFixture.h
index 534d5b3..feb1d7d 100644
--- a/tests/validation/fixtures/CopyFixture.h
+++ b/tests/validation/fixtures/CopyFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/CropResizeFixture.h b/tests/validation/fixtures/CropResizeFixture.h
index 450c68e..4f63891 100644
--- a/tests/validation/fixtures/CropResizeFixture.h
+++ b/tests/validation/fixtures/CropResizeFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/DeconvolutionLayerFixture.h b/tests/validation/fixtures/DeconvolutionLayerFixture.h
index 57951c0..7c7f32b 100644
--- a/tests/validation/fixtures/DeconvolutionLayerFixture.h
+++ b/tests/validation/fixtures/DeconvolutionLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/DepthConvertLayerFixture.h b/tests/validation/fixtures/DepthConvertLayerFixture.h
index 3fe1270..fa9e6a7 100644
--- a/tests/validation/fixtures/DepthConvertLayerFixture.h
+++ b/tests/validation/fixtures/DepthConvertLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/DepthToSpaceLayerFixture.h b/tests/validation/fixtures/DepthToSpaceLayerFixture.h
index b42bed5..8c2f561 100644
--- a/tests/validation/fixtures/DepthToSpaceLayerFixture.h
+++ b/tests/validation/fixtures/DepthToSpaceLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h b/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
index 7016e9f..af3838f 100644
--- a/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/DequantizationLayerFixture.h b/tests/validation/fixtures/DequantizationLayerFixture.h
index 3699613..1c1f46a 100644
--- a/tests/validation/fixtures/DequantizationLayerFixture.h
+++ b/tests/validation/fixtures/DequantizationLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/DerivativeFixture.h b/tests/validation/fixtures/DerivativeFixture.h
index 2df3340..e520a9e 100644
--- a/tests/validation/fixtures/DerivativeFixture.h
+++ b/tests/validation/fixtures/DerivativeFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/DilateFixture.h b/tests/validation/fixtures/DilateFixture.h
index aa531a1..51ed4df 100644
--- a/tests/validation/fixtures/DilateFixture.h
+++ b/tests/validation/fixtures/DilateFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/DirectConvolutionLayerFixture.h b/tests/validation/fixtures/DirectConvolutionLayerFixture.h
index c4e4180..3da5158 100644
--- a/tests/validation/fixtures/DirectConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/DirectConvolutionLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/DirectConvolutionLayerTensorShiftFixture.h b/tests/validation/fixtures/DirectConvolutionLayerTensorShiftFixture.h
index bf6f3e2..b313c84 100644
--- a/tests/validation/fixtures/DirectConvolutionLayerTensorShiftFixture.h
+++ b/tests/validation/fixtures/DirectConvolutionLayerTensorShiftFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/DropoutLayerFixture.h b/tests/validation/fixtures/DropoutLayerFixture.h
index be25802..63df936 100644
--- a/tests/validation/fixtures/DropoutLayerFixture.h
+++ b/tests/validation/fixtures/DropoutLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/ElementWiseUnaryFixture.h b/tests/validation/fixtures/ElementWiseUnaryFixture.h
index 3f6d5b3..f04463d 100644
--- a/tests/validation/fixtures/ElementWiseUnaryFixture.h
+++ b/tests/validation/fixtures/ElementWiseUnaryFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -44,10 +44,10 @@
 {
 public:
     template <typename...>
-    void setup(TensorShape input_shape, DataType input_data_type, ElementWiseUnary op)
+    void setup(TensorShape input_shape, DataType input_data_type, bool in_place, ElementWiseUnary op)
     {
         _op        = op;
-        _target    = compute_target(input_shape, input_data_type);
+        _target    = compute_target(input_shape, input_data_type, in_place);
         _reference = compute_reference(input_shape, input_data_type);
     }
 
@@ -115,25 +115,27 @@
         }
     }
 
-    TensorType compute_target(const TensorShape &shape, DataType data_type)
+    TensorType compute_target(const TensorShape &shape, DataType data_type, bool in_place)
     {
         // Create tensors
         TensorType src = create_tensor<TensorType>(shape, data_type);
         TensorType dst = create_tensor<TensorType>(shape, data_type);
 
+        TensorType *actual_dst = in_place ? &src : &dst;
+
         // Create and configure function
         FunctionType elwiseunary_layer;
-
-        elwiseunary_layer.configure(&src, &dst);
+        elwiseunary_layer.configure(&src, actual_dst);
 
         ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-        // Allocate tensors
         src.allocator()->allocate();
-        dst.allocator()->allocate();
         ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        if(!in_place)
+        {
+            ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+            dst.allocator()->allocate();
+            ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        }
 
         // Fill tensors
         fill(AccessorType(src), 0, data_type);
@@ -141,7 +143,14 @@
         // Compute function
         elwiseunary_layer.run();
 
-        return dst;
+        if(in_place)
+        {
+            return src;
+        }
+        else
+        {
+            return dst;
+        }
     }
 
     SimpleTensor<T> compute_reference(const TensorShape &shape, DataType data_type)
@@ -167,7 +176,7 @@
     template <typename...>
     void setup(const TensorShape &shape, DataType data_type)
     {
-        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, ElementWiseUnary::RSQRT);
+        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::RSQRT);
     }
 };
 
@@ -178,7 +187,7 @@
     template <typename...>
     void setup(const TensorShape &shape, DataType data_type)
     {
-        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, ElementWiseUnary::EXP);
+        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::EXP);
     }
 };
 
@@ -189,7 +198,18 @@
     template <typename...>
     void setup(const TensorShape &shape, DataType data_type)
     {
-        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, ElementWiseUnary::NEG);
+        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::NEG);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class NegValidationInPlaceFixture : public ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+    template <typename...>
+    void setup(const TensorShape &shape, DataType data_type, bool in_place)
+    {
+        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, in_place, ElementWiseUnary::NEG);
     }
 };
 
@@ -200,7 +220,7 @@
     template <typename...>
     void setup(const TensorShape &shape, DataType data_type)
     {
-        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, ElementWiseUnary::LOG);
+        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::LOG);
     }
 };
 
@@ -211,7 +231,7 @@
     template <typename...>
     void setup(const TensorShape &shape, DataType data_type)
     {
-        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, ElementWiseUnary::ABS);
+        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::ABS);
     }
 };
 
@@ -222,7 +242,7 @@
     template <typename...>
     void setup(const TensorShape &shape, DataType data_type)
     {
-        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, ElementWiseUnary::SIN);
+        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::SIN);
     }
 };
 
@@ -233,7 +253,7 @@
     template <typename...>
     void setup(const TensorShape &shape, DataType data_type)
     {
-        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, ElementWiseUnary::ROUND);
+        ElementWiseUnaryValidationFixture<TensorType, AccessorType, FunctionType, T>::setup(shape, data_type, false, ElementWiseUnary::ROUND);
     }
 };
 } // namespace validation
diff --git a/tests/validation/fixtures/ElementwiseOperationsFixture.h b/tests/validation/fixtures/ElementwiseOperationsFixture.h
index 44c096c..ebc52d5 100644
--- a/tests/validation/fixtures/ElementwiseOperationsFixture.h
+++ b/tests/validation/fixtures/ElementwiseOperationsFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/EqualizeHistogramFixture.h b/tests/validation/fixtures/EqualizeHistogramFixture.h
index 0d91e6d..f7a0312 100644
--- a/tests/validation/fixtures/EqualizeHistogramFixture.h
+++ b/tests/validation/fixtures/EqualizeHistogramFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/ErodeFixture.h b/tests/validation/fixtures/ErodeFixture.h
index d37cac0..b9f17a2 100644
--- a/tests/validation/fixtures/ErodeFixture.h
+++ b/tests/validation/fixtures/ErodeFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/FFTFixture.h b/tests/validation/fixtures/FFTFixture.h
index 1aaa596..dad774c 100644
--- a/tests/validation/fixtures/FFTFixture.h
+++ b/tests/validation/fixtures/FFTFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/FastCornersFixture.h b/tests/validation/fixtures/FastCornersFixture.h
index 6f2add6..ae66c37 100644
--- a/tests/validation/fixtures/FastCornersFixture.h
+++ b/tests/validation/fixtures/FastCornersFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/FillFixture.h b/tests/validation/fixtures/FillFixture.h
index c9817a3..706c135 100644
--- a/tests/validation/fixtures/FillFixture.h
+++ b/tests/validation/fixtures/FillFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/FlattenLayerFixture.h b/tests/validation/fixtures/FlattenLayerFixture.h
index d170806..9627983 100644
--- a/tests/validation/fixtures/FlattenLayerFixture.h
+++ b/tests/validation/fixtures/FlattenLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/FloorFixture.h b/tests/validation/fixtures/FloorFixture.h
index 246105e..9388486 100644
--- a/tests/validation/fixtures/FloorFixture.h
+++ b/tests/validation/fixtures/FloorFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/FullyConnectedLayerFixture.h b/tests/validation/fixtures/FullyConnectedLayerFixture.h
index 6952b22..86d39c0 100644
--- a/tests/validation/fixtures/FullyConnectedLayerFixture.h
+++ b/tests/validation/fixtures/FullyConnectedLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/FuseBatchNormalizationFixture.h b/tests/validation/fixtures/FuseBatchNormalizationFixture.h
index 780b4a0..552dc7c 100644
--- a/tests/validation/fixtures/FuseBatchNormalizationFixture.h
+++ b/tests/validation/fixtures/FuseBatchNormalizationFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/GEMMFixture.h b/tests/validation/fixtures/GEMMFixture.h
index efe7567..0a964a7 100644
--- a/tests/validation/fixtures/GEMMFixture.h
+++ b/tests/validation/fixtures/GEMMFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -678,7 +678,7 @@
 public:
     template <typename...>
     void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, bool interleave_lhs,
-               bool interleave_rhs, DataType data_type, float alpha, float beta, bool broadcast_bias, bool lhs_transpose, const ActivationLayerInfo &act_info)
+               bool interleave_rhs, bool export_to_cl_image, DataType data_type, float alpha, float beta, bool broadcast_bias, bool lhs_transpose, const ActivationLayerInfo &act_info)
     {
         GEMMLHSMatrixInfo lhs_info;
         lhs_info.m0         = m0;
@@ -688,11 +688,12 @@
         lhs_info.transpose  = lhs_transpose;
 
         GEMMRHSMatrixInfo rhs_info;
-        rhs_info.n0         = n0;
-        rhs_info.k0         = k0;
-        rhs_info.h0         = h0;
-        rhs_info.interleave = interleave_rhs;
-        rhs_info.transpose  = !lhs_transpose;
+        rhs_info.n0                 = n0;
+        rhs_info.k0                 = k0;
+        rhs_info.h0                 = h0;
+        rhs_info.interleave         = interleave_rhs;
+        rhs_info.transpose          = !lhs_transpose;
+        rhs_info.export_to_cl_image = export_to_cl_image;
 
         // Set the tensor shapes for LHS and RHS matrices
         const TensorShape lhs_shape(k, m, batch_size);
@@ -833,8 +834,7 @@
 public:
     template <typename...>
     void setup(unsigned int m_w, unsigned int m_h, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
-               bool interleave_lhs,
-               bool interleave_rhs, DataType data_type, float alpha, float beta, bool lhs_transpose, const ActivationLayerInfo &act_info)
+               bool interleave_lhs, bool interleave_rhs, bool export_to_cl_image, DataType data_type, float alpha, float beta, bool lhs_transpose, const ActivationLayerInfo &act_info)
     {
         GEMMLHSMatrixInfo lhs_info;
         lhs_info.m0         = m0;
@@ -844,11 +844,12 @@
         lhs_info.transpose  = lhs_transpose;
 
         GEMMRHSMatrixInfo rhs_info;
-        rhs_info.n0         = n0;
-        rhs_info.k0         = k0;
-        rhs_info.h0         = h0;
-        rhs_info.interleave = interleave_rhs;
-        rhs_info.transpose  = !lhs_transpose;
+        rhs_info.n0                 = n0;
+        rhs_info.k0                 = k0;
+        rhs_info.h0                 = h0;
+        rhs_info.interleave         = interleave_rhs;
+        rhs_info.transpose          = !lhs_transpose;
+        rhs_info.export_to_cl_image = export_to_cl_image;
 
         // In case of GEMM3D, m is the product between m_w and m_h
         const unsigned int m = m_w * m_h;
@@ -985,18 +986,19 @@
 public:
     template <typename...>
     void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int h0,
-               bool interleave_rhs, bool transpose_rhs, DataType data_type, float alpha, float beta, bool broadcast_bias, const ActivationLayerInfo &act_info)
+               bool interleave_rhs, bool transpose_rhs, bool export_to_cl_image, DataType data_type, float alpha, float beta, bool broadcast_bias, const ActivationLayerInfo &act_info)
     {
         GEMMLHSMatrixInfo lhs_info;
         lhs_info.m0 = m0;
         lhs_info.k0 = k0;
 
         GEMMRHSMatrixInfo rhs_info;
-        rhs_info.n0         = n0;
-        rhs_info.k0         = k0;
-        rhs_info.h0         = h0;
-        rhs_info.interleave = interleave_rhs;
-        rhs_info.transpose  = transpose_rhs;
+        rhs_info.n0                 = n0;
+        rhs_info.k0                 = k0;
+        rhs_info.h0                 = h0;
+        rhs_info.interleave         = interleave_rhs;
+        rhs_info.transpose          = transpose_rhs;
+        rhs_info.export_to_cl_image = export_to_cl_image;
 
         // Set the tensor shapes for LHS and RHS matrices
         const TensorShape lhs_shape(k, m, batch_size);
@@ -1123,18 +1125,19 @@
 public:
     template <typename...>
     void setup(unsigned int m_w, unsigned int m_h, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int h0,
-               bool interleave_rhs, bool transpose_rhs, DataType data_type, float alpha, float beta, const ActivationLayerInfo &act_info)
+               bool interleave_rhs, bool transpose_rhs, bool export_to_cl_image, DataType data_type, float alpha, float beta, const ActivationLayerInfo &act_info)
     {
         GEMMLHSMatrixInfo lhs_info;
         lhs_info.m0 = m0;
         lhs_info.k0 = k0;
 
         GEMMRHSMatrixInfo rhs_info;
-        rhs_info.n0         = n0;
-        rhs_info.k0         = k0;
-        rhs_info.h0         = h0;
-        rhs_info.interleave = interleave_rhs;
-        rhs_info.transpose  = transpose_rhs;
+        rhs_info.n0                 = n0;
+        rhs_info.k0                 = k0;
+        rhs_info.h0                 = h0;
+        rhs_info.interleave         = interleave_rhs;
+        rhs_info.transpose          = transpose_rhs;
+        rhs_info.export_to_cl_image = export_to_cl_image;
 
         // In case of GEMM3D, m is the product between m_w and m_h
         const unsigned int m = m_w * m_h;
diff --git a/tests/validation/fixtures/GEMMInterleave4x4Fixture.h b/tests/validation/fixtures/GEMMInterleave4x4Fixture.h
index 26ffd36..fb04a08 100644
--- a/tests/validation/fixtures/GEMMInterleave4x4Fixture.h
+++ b/tests/validation/fixtures/GEMMInterleave4x4Fixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/GEMMLowpAssemblyFixture.h b/tests/validation/fixtures/GEMMLowpAssemblyFixture.h
index 0394906..e9ec1bc 100644
--- a/tests/validation/fixtures/GEMMLowpAssemblyFixture.h
+++ b/tests/validation/fixtures/GEMMLowpAssemblyFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/GEMMLowpFixture.h b/tests/validation/fixtures/GEMMLowpFixture.h
index e3dc738..94b25e2 100644
--- a/tests/validation/fixtures/GEMMLowpFixture.h
+++ b/tests/validation/fixtures/GEMMLowpFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/GEMMReshapeLHSMatrixFixture.h b/tests/validation/fixtures/GEMMReshapeLHSMatrixFixture.h
index 3a5ab7c..d085509 100644
--- a/tests/validation/fixtures/GEMMReshapeLHSMatrixFixture.h
+++ b/tests/validation/fixtures/GEMMReshapeLHSMatrixFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/GEMMReshapeRHSMatrixFixture.h b/tests/validation/fixtures/GEMMReshapeRHSMatrixFixture.h
index e03c4f3..99bfa3b 100644
--- a/tests/validation/fixtures/GEMMReshapeRHSMatrixFixture.h
+++ b/tests/validation/fixtures/GEMMReshapeRHSMatrixFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/GEMMTranspose1xWFixture.h b/tests/validation/fixtures/GEMMTranspose1xWFixture.h
index 89d2238..9619a66 100644
--- a/tests/validation/fixtures/GEMMTranspose1xWFixture.h
+++ b/tests/validation/fixtures/GEMMTranspose1xWFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/GatherFixture.h b/tests/validation/fixtures/GatherFixture.h
index f2dcd4a..0a9f8c1 100644
--- a/tests/validation/fixtures/GatherFixture.h
+++ b/tests/validation/fixtures/GatherFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/Gaussian3x3Fixture.h b/tests/validation/fixtures/Gaussian3x3Fixture.h
index 396e63e..4a154ea 100644
--- a/tests/validation/fixtures/Gaussian3x3Fixture.h
+++ b/tests/validation/fixtures/Gaussian3x3Fixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/Gaussian5x5Fixture.h b/tests/validation/fixtures/Gaussian5x5Fixture.h
index 31d7acf..68f91e1 100644
--- a/tests/validation/fixtures/Gaussian5x5Fixture.h
+++ b/tests/validation/fixtures/Gaussian5x5Fixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/GaussianPyramidHalfFixture.h b/tests/validation/fixtures/GaussianPyramidHalfFixture.h
index ef7657a..f91b1d5 100644
--- a/tests/validation/fixtures/GaussianPyramidHalfFixture.h
+++ b/tests/validation/fixtures/GaussianPyramidHalfFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/HOGDescriptorFixture.h b/tests/validation/fixtures/HOGDescriptorFixture.h
index 6097059..1021e12 100644
--- a/tests/validation/fixtures/HOGDescriptorFixture.h
+++ b/tests/validation/fixtures/HOGDescriptorFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/HOGDetectorFixture.h b/tests/validation/fixtures/HOGDetectorFixture.h
index c2d0514..f12e65b 100644
--- a/tests/validation/fixtures/HOGDetectorFixture.h
+++ b/tests/validation/fixtures/HOGDetectorFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/HOGMultiDetectionFixture.h b/tests/validation/fixtures/HOGMultiDetectionFixture.h
index 039f3f4..c37bdb6 100644
--- a/tests/validation/fixtures/HOGMultiDetectionFixture.h
+++ b/tests/validation/fixtures/HOGMultiDetectionFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/HarrisCornersFixture.h b/tests/validation/fixtures/HarrisCornersFixture.h
index f1d1f2d..dbe77dd 100644
--- a/tests/validation/fixtures/HarrisCornersFixture.h
+++ b/tests/validation/fixtures/HarrisCornersFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/HistogramFixture.h b/tests/validation/fixtures/HistogramFixture.h
index 7349bdf..dceb23b 100644
--- a/tests/validation/fixtures/HistogramFixture.h
+++ b/tests/validation/fixtures/HistogramFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/Im2ColFixture.h b/tests/validation/fixtures/Im2ColFixture.h
index 809bafd..e1f33a3 100644
--- a/tests/validation/fixtures/Im2ColFixture.h
+++ b/tests/validation/fixtures/Im2ColFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -59,11 +59,6 @@
         _has_bias    = data_type != DataType::QASYMM8;
         _num_groups  = num_groups;
 
-        if(_data_layout == DataLayout::NHWC)
-        {
-            permute(input_shape, PermutationVector(2U, 0U, 1U));
-        }
-
         TensorInfo input_info(input_shape, 1, data_type);
         input_info.set_data_layout(_data_layout);
 
diff --git a/tests/validation/fixtures/InstanceNormalizationLayerFixture.h b/tests/validation/fixtures/InstanceNormalizationLayerFixture.h
index 5e230d4..06ff4d3 100644
--- a/tests/validation/fixtures/InstanceNormalizationLayerFixture.h
+++ b/tests/validation/fixtures/InstanceNormalizationLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/IntegralImageFixture.h b/tests/validation/fixtures/IntegralImageFixture.h
index 8d2149e..abc9973 100644
--- a/tests/validation/fixtures/IntegralImageFixture.h
+++ b/tests/validation/fixtures/IntegralImageFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/L2NormalizeLayerFixture.h b/tests/validation/fixtures/L2NormalizeLayerFixture.h
index e3e1510..c617f10 100644
--- a/tests/validation/fixtures/L2NormalizeLayerFixture.h
+++ b/tests/validation/fixtures/L2NormalizeLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/LSTMLayerFixture.h b/tests/validation/fixtures/LSTMLayerFixture.h
index 858ee07..bf785bb 100644
--- a/tests/validation/fixtures/LSTMLayerFixture.h
+++ b/tests/validation/fixtures/LSTMLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/LaplacianPyramidFixture.h b/tests/validation/fixtures/LaplacianPyramidFixture.h
index 6344272..7131996 100644
--- a/tests/validation/fixtures/LaplacianPyramidFixture.h
+++ b/tests/validation/fixtures/LaplacianPyramidFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/LaplacianReconstructFixture.h b/tests/validation/fixtures/LaplacianReconstructFixture.h
index dfa3023..35432ee 100644
--- a/tests/validation/fixtures/LaplacianReconstructFixture.h
+++ b/tests/validation/fixtures/LaplacianReconstructFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/LocallyConnectedFixture.h b/tests/validation/fixtures/LocallyConnectedFixture.h
index f87e6e4..2e2b716 100644
--- a/tests/validation/fixtures/LocallyConnectedFixture.h
+++ b/tests/validation/fixtures/LocallyConnectedFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/MagnitudeFixture.h b/tests/validation/fixtures/MagnitudeFixture.h
index 0930fb4..81f4970 100644
--- a/tests/validation/fixtures/MagnitudeFixture.h
+++ b/tests/validation/fixtures/MagnitudeFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/MaxUnpoolingLayerFixture.h b/tests/validation/fixtures/MaxUnpoolingLayerFixture.h
new file mode 100644
index 0000000..086bd6c
--- /dev/null
+++ b/tests/validation/fixtures/MaxUnpoolingLayerFixture.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_POOLING_LAYER_FIXTURE
+#define ARM_COMPUTE_TEST_POOLING_LAYER_FIXTURE
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "tests/AssetsLibrary.h"
+#include "tests/Globals.h"
+#include "tests/IAccessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/validation/reference/MaxUnpoolingLayer.h"
+#include "tests/validation/reference/PoolingLayer.h"
+#include <random>
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType, typename AccessorType, typename PoolingFunctionType, typename MaxUnpoolingFunctionType, typename T>
+class MaxUnpoolingLayerValidationGenericFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(TensorShape shape, PoolingLayerInfo pool_info, DataType data_type, DataLayout data_layout)
+    {
+        std::mt19937                    gen(library->seed());
+        std::uniform_int_distribution<> offset_dis(0, 20);
+        const float                     scale     = data_type == DataType::QASYMM8_SIGNED ? 1.f / 127.f : 1.f / 255.f;
+        const int                       scale_in  = data_type == DataType::QASYMM8_SIGNED ? -offset_dis(gen) : offset_dis(gen);
+        const int                       scale_out = data_type == DataType::QASYMM8_SIGNED ? -offset_dis(gen) : offset_dis(gen);
+        const QuantizationInfo          input_qinfo(scale, scale_in);
+        const QuantizationInfo          output_qinfo(scale, scale_out);
+        _pool_info = pool_info;
+        _target    = compute_target(shape, pool_info, data_type, data_layout, input_qinfo, output_qinfo);
+        _reference = compute_reference(shape, pool_info, data_type, input_qinfo, output_qinfo);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor)
+    {
+        if(!is_data_type_quantized(tensor.data_type()))
+        {
+            std::uniform_real_distribution<> distribution(-1.f, 1.f);
+            library->fill(tensor, distribution, 0);
+        }
+        else // data type is quantized_asymmetric
+        {
+            library->fill_tensor_uniform(tensor, 0);
+        }
+    }
+
+    TensorType compute_target(TensorShape input_shape, PoolingLayerInfo pool_info,
+                              DataType data_type, DataLayout data_layout,
+                              QuantizationInfo input_qinfo, QuantizationInfo output_qinfo)
+    {
+        // Change shape in case of NHWC.
+        if(data_layout == DataLayout::NHWC)
+        {
+            permute(input_shape, PermutationVector(2U, 0U, 1U));
+        }
+
+        // Create tensors
+        TensorType        src       = create_tensor<TensorType>(input_shape, data_type, 1, input_qinfo, data_layout);
+        const TensorShape dst_shape = misc::shape_calculator::compute_pool_shape(*(src.info()), pool_info);
+        TensorType        dst       = create_tensor<TensorType>(dst_shape, data_type, 1, output_qinfo, data_layout);
+        TensorType        unpooled  = create_tensor<TensorType>(input_shape, data_type, 1, output_qinfo, data_layout);
+        TensorType        indices   = create_tensor<TensorType>(dst_shape, DataType::U32, 1, output_qinfo, data_layout);
+
+        // Create and configure function
+        PoolingFunctionType pool_layer;
+        pool_layer.configure(&src, &dst, pool_info, &indices);
+        // Create and configure function
+
+        MaxUnpoolingFunctionType unpool_layer;
+        unpool_layer.configure(&dst, &indices, &unpooled, pool_info);
+
+        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(indices.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        dst.allocator()->allocate();
+        indices.allocator()->allocate();
+        unpooled.allocator()->allocate();
+
+        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!indices.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!unpooled.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Fill tensors
+        fill(AccessorType(src));
+
+        // Compute function
+        pool_layer.run();
+        unpool_layer.run();
+        return unpooled;
+    }
+
+    SimpleTensor<T> compute_reference(TensorShape input_shape, PoolingLayerInfo info, DataType data_type,
+                                      QuantizationInfo input_qinfo, QuantizationInfo output_qinfo)
+    {
+        SimpleTensor<T>        src(input_shape, data_type, 1, input_qinfo);
+        SimpleTensor<uint32_t> indices{};
+        // Fill reference
+        fill(src);
+        auto pooled_tensor = reference::pooling_layer<T>(src, info, output_qinfo, &indices);
+        return reference::max_unpooling_layer<T>(pooled_tensor, info, output_qinfo, indices, input_shape);
+    }
+
+    TensorType       _target{};
+    SimpleTensor<T>  _reference{};
+    PoolingLayerInfo _pool_info{};
+};
+
+template <typename TensorType, typename AccessorType, typename F1, typename F2, typename T>
+class MaxUnpoolingLayerValidationFixture : public MaxUnpoolingLayerValidationGenericFixture<TensorType, AccessorType, F1, F2, T>
+{
+public:
+    template <typename...>
+    void setup(TensorShape shape, PoolingType pool_type, Size2D pool_size, PadStrideInfo pad_stride_info, DataType data_type, DataLayout data_layout)
+    {
+        MaxUnpoolingLayerValidationGenericFixture<TensorType, AccessorType, F1, F2, T>::setup(shape, PoolingLayerInfo(pool_type, pool_size, data_layout, pad_stride_info, true),
+                                                                                              data_type, data_layout);
+    }
+};
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_POOLING_LAYER_FIXTURE */
diff --git a/tests/validation/fixtures/MeanStdDevFixture.h b/tests/validation/fixtures/MeanStdDevFixture.h
index 58d4644..ec0599b 100644
--- a/tests/validation/fixtures/MeanStdDevFixture.h
+++ b/tests/validation/fixtures/MeanStdDevFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/MeanStdDevNormalizationLayerFixture.h b/tests/validation/fixtures/MeanStdDevNormalizationLayerFixture.h
index 1c48b74..47aa38e 100644
--- a/tests/validation/fixtures/MeanStdDevNormalizationLayerFixture.h
+++ b/tests/validation/fixtures/MeanStdDevNormalizationLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/Median3x3Fixture.h b/tests/validation/fixtures/Median3x3Fixture.h
index 0946358..2b97800 100644
--- a/tests/validation/fixtures/Median3x3Fixture.h
+++ b/tests/validation/fixtures/Median3x3Fixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/MinMaxLocationFixture.h b/tests/validation/fixtures/MinMaxLocationFixture.h
index 120a5c4..73466cc 100644
--- a/tests/validation/fixtures/MinMaxLocationFixture.h
+++ b/tests/validation/fixtures/MinMaxLocationFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/NonLinearFilterFixture.h b/tests/validation/fixtures/NonLinearFilterFixture.h
index 78ba0ea..03d2bcd 100644
--- a/tests/validation/fixtures/NonLinearFilterFixture.h
+++ b/tests/validation/fixtures/NonLinearFilterFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/NonMaxSuppressionFixture.h b/tests/validation/fixtures/NonMaxSuppressionFixture.h
index 9299ed6..de5d6d5 100644
--- a/tests/validation/fixtures/NonMaxSuppressionFixture.h
+++ b/tests/validation/fixtures/NonMaxSuppressionFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/NormalizationLayerFixture.h b/tests/validation/fixtures/NormalizationLayerFixture.h
index 4d6ef70..54dfd59 100644
--- a/tests/validation/fixtures/NormalizationLayerFixture.h
+++ b/tests/validation/fixtures/NormalizationLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/NormalizePlanarYUVLayerFixture.h b/tests/validation/fixtures/NormalizePlanarYUVLayerFixture.h
index b46bd3c..bd84692 100644
--- a/tests/validation/fixtures/NormalizePlanarYUVLayerFixture.h
+++ b/tests/validation/fixtures/NormalizePlanarYUVLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/OpticalFlowFixture.h b/tests/validation/fixtures/OpticalFlowFixture.h
index f8f2021..5c3285a 100644
--- a/tests/validation/fixtures/OpticalFlowFixture.h
+++ b/tests/validation/fixtures/OpticalFlowFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/PadLayerFixture.h b/tests/validation/fixtures/PadLayerFixture.h
index 58ca81f..2279c8b 100644
--- a/tests/validation/fixtures/PadLayerFixture.h
+++ b/tests/validation/fixtures/PadLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/PermuteFixture.h b/tests/validation/fixtures/PermuteFixture.h
index 7635173..9bbc0cb 100644
--- a/tests/validation/fixtures/PermuteFixture.h
+++ b/tests/validation/fixtures/PermuteFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/PhaseFixture.h b/tests/validation/fixtures/PhaseFixture.h
index 09badcf..b80d1ae 100644
--- a/tests/validation/fixtures/PhaseFixture.h
+++ b/tests/validation/fixtures/PhaseFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/PixelWiseMultiplicationFixture.h b/tests/validation/fixtures/PixelWiseMultiplicationFixture.h
index f561a37..4eb8385 100644
--- a/tests/validation/fixtures/PixelWiseMultiplicationFixture.h
+++ b/tests/validation/fixtures/PixelWiseMultiplicationFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -56,10 +56,12 @@
                QuantizationInfo    qinfo0,
                QuantizationInfo    qinfo1,
                QuantizationInfo    qinfo_out,
-               ActivationLayerInfo act_info)
+               ActivationLayerInfo act_info,
+               bool                is_inplace)
     {
-        _target    = compute_target(shape0, shape1, dt_in1, dt_in2, dt_out, scale, convert_policy, rounding_policy, qinfo0, qinfo1, qinfo_out, act_info);
-        _reference = compute_reference(shape0, shape1, dt_in1, dt_in2, dt_out, scale, convert_policy, rounding_policy, qinfo0, qinfo1, qinfo_out, act_info);
+        _is_inplace = is_inplace;
+        _target     = compute_target(shape0, shape1, dt_in1, dt_in2, dt_out, scale, convert_policy, rounding_policy, qinfo0, qinfo1, qinfo_out, act_info);
+        _reference  = compute_reference(shape0, shape1, dt_in1, dt_in2, dt_out, scale, convert_policy, rounding_policy, qinfo0, qinfo1, qinfo_out, act_info);
     }
 
 protected:
@@ -78,22 +80,24 @@
         TensorType src2 = create_tensor<TensorType>(shape1, dt_in2, 1, qinfo1);
         TensorType dst  = create_tensor<TensorType>(TensorShape::broadcast_shape(shape0, shape1), dt_out, 1, qinfo_out);
 
+        auto allocate_tensor = [](TensorType & t)
+        {
+            ARM_COMPUTE_EXPECT(t.info()->is_resizable(), framework::LogLevel::ERRORS);
+            t.allocator()->allocate();
+            ARM_COMPUTE_EXPECT(!t.info()->is_resizable(), framework::LogLevel::ERRORS);
+        };
+
         // Create and configure function
         FunctionType multiply;
-        multiply.configure(&src1, &src2, &dst, scale, convert_policy, rounding_policy, act_info);
+        multiply.configure(&src1, &src2, (_is_inplace ? &src1 : &dst), scale, convert_policy, rounding_policy, act_info);
 
-        ARM_COMPUTE_EXPECT(src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        allocate_tensor(src1);
+        allocate_tensor(src2);
 
-        // Allocate tensors
-        src1.allocator()->allocate();
-        src2.allocator()->allocate();
-        dst.allocator()->allocate();
-
-        ARM_COMPUTE_EXPECT(!src1.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!src2.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        if(!_is_inplace)
+        {
+            allocate_tensor(dst);
+        }
 
         // Fill tensors
         fill(AccessorType(src1), 0);
@@ -102,6 +106,11 @@
         // Compute function
         multiply.run();
 
+        if(_is_inplace)
+        {
+            return src1;
+        }
+
         return dst;
     }
 
@@ -113,39 +122,45 @@
         SimpleTensor<T1> src1{ shape0, dt_in1, 1, qinfo0 };
         SimpleTensor<T2> src2{ shape1, dt_in2, 1, qinfo1 };
 
+        // current in-place implementation only supports same metadata of input and output tensors.
+        // By ignoring output quantization information here, we can make test cases implementation much simpler.
+        QuantizationInfo output_qinfo = _is_inplace ? qinfo0 : qinfo_out;
+
         // Fill reference
         fill(src1, 0);
         fill(src2, 1);
 
-        auto result = reference::pixel_wise_multiplication<T1, T2, T3>(src1, src2, scale, convert_policy, rounding_policy, dt_out, qinfo_out);
-        return act_info.enabled() ? reference::activation_layer(result, act_info, qinfo_out) : result;
+        auto result = reference::pixel_wise_multiplication<T1, T2, T3>(src1, src2, scale, convert_policy, rounding_policy, dt_out, output_qinfo);
+        return act_info.enabled() ? reference::activation_layer(result, act_info, output_qinfo) : result;
     }
 
     TensorType       _target{};
     SimpleTensor<T3> _reference{};
+    bool             _is_inplace{ false };
 };
 
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T1, typename T2>
-class PixelWiseMultiplicationValidationFixture : public PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T1, typename T2, typename T3 = T2>
+class PixelWiseMultiplicationValidationFixture : public PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2, T3>
 {
 public:
     template <typename...>
-    void setup(const TensorShape &shape, DataType dt_in1, DataType dt_in2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy)
+    void setup(const TensorShape &shape, DataType dt_in1, DataType dt_in2, DataType dt_out, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, bool is_inplace)
     {
-        PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2>::setup(shape, shape, dt_in1, dt_in2, dt_in2, scale, convert_policy, rounding_policy,
-                                                                                                               QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), ActivationLayerInfo());
+        PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2, T3>::setup(shape, shape, dt_in1, dt_in2, dt_out, scale, convert_policy, rounding_policy,
+                                                                                                                   QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), ActivationLayerInfo(), is_inplace);
     }
 };
 
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T1, typename T2>
-class PixelWiseMultiplicationBroadcastValidationFixture : public PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T1, typename T2, typename T3 = T2>
+class PixelWiseMultiplicationBroadcastValidationFixture : public PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2, T3>
 {
 public:
     template <typename...>
-    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType dt_in1, DataType dt_in2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy)
+    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType dt_in1, DataType dt_in2, DataType dt_out, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy,
+               bool is_inplace)
     {
-        PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2>::setup(shape0, shape1, dt_in1, dt_in2, dt_in2, scale, convert_policy, rounding_policy,
-                                                                                                               QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), ActivationLayerInfo());
+        PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2, T3>::setup(shape0, shape1, dt_in1, dt_in2, dt_out, scale, convert_policy, rounding_policy,
+                                                                                                                   QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), ActivationLayerInfo(), is_inplace);
     }
 };
 
@@ -154,10 +169,10 @@
 {
 public:
     template <typename...>
-    void setup(const TensorShape &shape, DataType dt_in1, DataType dt_in2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, ActivationLayerInfo act_info)
+    void setup(const TensorShape &shape, DataType dt_in1, DataType dt_in2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy, ActivationLayerInfo act_info, bool is_inplace)
     {
         PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2>::setup(shape, shape, dt_in1, dt_in2, dt_in2, scale, convert_policy, rounding_policy,
-                                                                                                               QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info);
+                                                                                                               QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info, is_inplace);
     }
 };
 
@@ -167,10 +182,10 @@
 public:
     template <typename...>
     void setup(const TensorShape &shape0, const TensorShape &shape1, DataType dt_in1, DataType dt_in2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy,
-               ActivationLayerInfo act_info)
+               ActivationLayerInfo act_info, bool is_inplace)
     {
         PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2>::setup(shape0, shape1, dt_in1, dt_in2, dt_in2, scale, convert_policy, rounding_policy,
-                                                                                                               QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info);
+                                                                                                               QuantizationInfo(), QuantizationInfo(), QuantizationInfo(), act_info, is_inplace);
     }
 };
 
@@ -180,10 +195,23 @@
 public:
     template <typename...>
     void setup(const TensorShape &shape, DataType dt_in1, DataType dt_in2, DataType dt_out, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy,
-               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out)
+               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out, bool is_inplace)
     {
         PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2, T3>::setup(shape, shape, dt_in1, dt_in2, dt_out, scale, convert_policy, rounding_policy,
-                                                                                                                   qinfo0, qinfo1, qinfo_out, ActivationLayerInfo());
+                                                                                                                   qinfo0, qinfo1, qinfo_out, ActivationLayerInfo(), is_inplace);
+    }
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T1, typename T2, typename T3 = T2>
+class PixelWiseMultiplicationBroadcastValidationQuantizedFixture : public PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2, T3>
+{
+public:
+    template <typename...>
+    void setup(const TensorShape &shape0, const TensorShape &shape1, DataType dt_in1, DataType dt_in2, DataType dt_out, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy,
+               QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out, bool is_inplace)
+    {
+        PixelWiseMultiplicationGenericValidationFixture<TensorType, AccessorType, FunctionType, T1, T2, T3>::setup(shape0, shape1, dt_in1, dt_in2, dt_out, scale, convert_policy, rounding_policy,
+                                                                                                                   qinfo0, qinfo1, qinfo_out, ActivationLayerInfo(), is_inplace);
     }
 };
 } // namespace validation
diff --git a/tests/validation/fixtures/PoolingLayerFixture.h b/tests/validation/fixtures/PoolingLayerFixture.h
index eb40cea..9cd1c46 100644
--- a/tests/validation/fixtures/PoolingLayerFixture.h
+++ b/tests/validation/fixtures/PoolingLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -86,7 +86,6 @@
         {
             permute(shape, PermutationVector(2U, 0U, 1U));
         }
-
         // Create tensors
         TensorType        src       = create_tensor<TensorType>(shape, data_type, 1, input_qinfo, data_layout);
         const TensorShape dst_shape = misc::shape_calculator::compute_pool_shape(*(src.info()), info);
diff --git a/tests/validation/fixtures/PriorBoxLayerFixture.h b/tests/validation/fixtures/PriorBoxLayerFixture.h
index fb15631..ef18c0d 100644
--- a/tests/validation/fixtures/PriorBoxLayerFixture.h
+++ b/tests/validation/fixtures/PriorBoxLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/QLSTMLayerNormalizationFixture.h b/tests/validation/fixtures/QLSTMLayerNormalizationFixture.h
index cee39c2..0cf2ef0 100644
--- a/tests/validation/fixtures/QLSTMLayerNormalizationFixture.h
+++ b/tests/validation/fixtures/QLSTMLayerNormalizationFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/QuantizationLayerFixture.h b/tests/validation/fixtures/QuantizationLayerFixture.h
index 085abef..4f46f99 100644
--- a/tests/validation/fixtures/QuantizationLayerFixture.h
+++ b/tests/validation/fixtures/QuantizationLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/RNNLayerFixture.h b/tests/validation/fixtures/RNNLayerFixture.h
index 2645116..1668e94 100644
--- a/tests/validation/fixtures/RNNLayerFixture.h
+++ b/tests/validation/fixtures/RNNLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/ROIAlignLayerFixture.h b/tests/validation/fixtures/ROIAlignLayerFixture.h
index e4470c9..c631c24 100644
--- a/tests/validation/fixtures/ROIAlignLayerFixture.h
+++ b/tests/validation/fixtures/ROIAlignLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/RangeFixture.h b/tests/validation/fixtures/RangeFixture.h
index 4862069..604007d 100644
--- a/tests/validation/fixtures/RangeFixture.h
+++ b/tests/validation/fixtures/RangeFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/ReduceMeanFixture.h b/tests/validation/fixtures/ReduceMeanFixture.h
index 44bb9fc..d102921 100644
--- a/tests/validation/fixtures/ReduceMeanFixture.h
+++ b/tests/validation/fixtures/ReduceMeanFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/ReductionOperationFixture.h b/tests/validation/fixtures/ReductionOperationFixture.h
index a93bf49..3fb8544 100644
--- a/tests/validation/fixtures/ReductionOperationFixture.h
+++ b/tests/validation/fixtures/ReductionOperationFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/RemapFixture.h b/tests/validation/fixtures/RemapFixture.h
index 78b3015..e851cdb 100644
--- a/tests/validation/fixtures/RemapFixture.h
+++ b/tests/validation/fixtures/RemapFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/ReorgLayerFixture.h b/tests/validation/fixtures/ReorgLayerFixture.h
index 3300e0d..6308022 100644
--- a/tests/validation/fixtures/ReorgLayerFixture.h
+++ b/tests/validation/fixtures/ReorgLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/ReshapeLayerFixture.h b/tests/validation/fixtures/ReshapeLayerFixture.h
index 22f5b17..a89a947 100644
--- a/tests/validation/fixtures/ReshapeLayerFixture.h
+++ b/tests/validation/fixtures/ReshapeLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/ReverseFixture.h b/tests/validation/fixtures/ReverseFixture.h
index ed5253d..4982cae 100644
--- a/tests/validation/fixtures/ReverseFixture.h
+++ b/tests/validation/fixtures/ReverseFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/ScaleFixture.h b/tests/validation/fixtures/ScaleFixture.h
index e3846ed..e2ed3ab 100644
--- a/tests/validation/fixtures/ScaleFixture.h
+++ b/tests/validation/fixtures/ScaleFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -48,9 +48,6 @@
     void setup(TensorShape shape, DataType data_type, QuantizationInfo quantization_info, DataLayout data_layout, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy,
                bool align_corners)
     {
-        constexpr float max_width  = 8192.0f;
-        constexpr float max_height = 6384.0f;
-
         _shape             = shape;
         _policy            = policy;
         _border_mode       = border_mode;
@@ -59,41 +56,45 @@
         _quantization_info = quantization_info;
         _align_corners     = align_corners;
 
-        std::mt19937                          generator(library->seed());
-        std::uniform_real_distribution<float> distribution_float(0.25, 3);
-        float                                 scale_x = distribution_float(generator);
-        float                                 scale_y = distribution_float(generator);
+        generate_scale(shape);
 
-        const int idx_width  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-        const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
-        scale_x = ((shape[idx_width] * scale_x) > max_width) ? (max_width / shape[idx_width]) : scale_x;
-        scale_y = ((shape[idx_height] * scale_y) > max_height) ? (max_height / shape[idx_height]) : scale_y;
-
-        const bool align_corners_a = policy == InterpolationPolicy::BILINEAR
-                                     && sampling_policy == SamplingPolicy::TOP_LEFT
-                                     && align_corners;
-
-        if(align_corners_a)
-        {
-            /* When align_corners = true is used for bilinear, both width and height
-             * of output should be > 1 to avoid overflow during computation otherwise
-             * it fails while checking argument values.
-             */
-            constexpr float min_width  = 2.f;
-            constexpr float min_height = 2.f;
-            scale_x                    = ((shape[idx_width] * scale_x) < min_width) ? (min_width / shape[idx_width]) : scale_x;
-            scale_y                    = ((shape[idx_height] * scale_y) < min_height) ? (min_height / shape[idx_height]) : scale_y;
-        }
-
+        std::mt19937                           generator(library->seed());
         std::uniform_int_distribution<uint8_t> distribution_u8(0, 255);
-        T                                      constant_border_value = static_cast<T>(distribution_u8(generator));
+        _constant_border_value = static_cast<T>(distribution_u8(generator));
 
-        _target    = compute_target(shape, data_layout, scale_x, scale_y, policy, border_mode, constant_border_value, sampling_policy, quantization_info);
-        _reference = compute_reference(shape, scale_x, scale_y, policy, border_mode, constant_border_value, sampling_policy, quantization_info);
+        _target    = compute_target(shape, data_layout);
+        _reference = compute_reference(shape);
     }
 
 protected:
+    void generate_scale(const TensorShape &shape)
+    {
+        static constexpr float _min_scale{ 0.25f };
+        static constexpr float _max_scale{ 3.f };
+
+        constexpr float max_width{ 8192.0f };
+        constexpr float max_height{ 6384.0f };
+        const float     min_width{ 1.f };
+        const float     min_height{ 1.f };
+
+        std::mt19937                          generator(library->seed());
+        std::uniform_real_distribution<float> distribution_float(_min_scale, _max_scale);
+
+        auto generate = [&](size_t input_size, float min_output, float max_output) -> float
+        {
+            const float generated_scale = distribution_float(generator);
+            const float output_size     = utility::clamp(static_cast<float>(input_size) * generated_scale, min_output, max_output);
+            return output_size / input_size;
+        };
+
+        // Input shape is always given in NCHW layout. NHWC is dealt by permute in compute_target()
+        const int idx_width  = get_data_layout_dimension_index(DataLayout::NCHW, DataLayoutDimension::WIDTH);
+        const int idx_height = get_data_layout_dimension_index(DataLayout::NCHW, DataLayoutDimension::HEIGHT);
+
+        _scale_x = generate(shape[idx_width], min_width, max_width);
+        _scale_y = generate(shape[idx_height], min_height, max_height);
+    }
+
     template <typename U>
     void fill(U &&tensor)
     {
@@ -114,9 +115,7 @@
         }
     }
 
-    TensorType compute_target(TensorShape shape, DataLayout data_layout, const float scale_x, const float scale_y,
-                              InterpolationPolicy policy, BorderMode border_mode, T constant_border_value, SamplingPolicy sampling_policy,
-                              QuantizationInfo quantization_info)
+    TensorType compute_target(TensorShape shape, DataLayout data_layout)
     {
         // Change shape in case of NHWC.
         if(data_layout == DataLayout::NHWC)
@@ -125,20 +124,20 @@
         }
 
         // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, _data_type, 1, quantization_info, data_layout);
+        TensorType src = create_tensor<TensorType>(shape, _data_type, 1, _quantization_info, data_layout);
 
         const int idx_width  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
         const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
 
         TensorShape shape_scaled(shape);
-        shape_scaled.set(idx_width, shape[idx_width] * scale_x);
-        shape_scaled.set(idx_height, shape[idx_height] * scale_y);
-        TensorType dst = create_tensor<TensorType>(shape_scaled, _data_type, 1, quantization_info, data_layout);
+        shape_scaled.set(idx_width, shape[idx_width] * _scale_x, /* apply_dim_correction = */ false);
+        shape_scaled.set(idx_height, shape[idx_height] * _scale_y, /* apply_dim_correction = */ false);
+        TensorType dst = create_tensor<TensorType>(shape_scaled, _data_type, 1, _quantization_info, data_layout);
 
         // Create and configure function
         FunctionType scale;
 
-        scale.configure(&src, &dst, policy, border_mode, constant_border_value, sampling_policy, /* use_padding */ true, _align_corners);
+        scale.configure(&src, &dst, ScaleKernelInfo{ _policy, _border_mode, _constant_border_value, _sampling_policy, /* use_padding */ true, _align_corners });
 
         ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
         ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
@@ -158,17 +157,15 @@
         return dst;
     }
 
-    SimpleTensor<T> compute_reference(const TensorShape &shape, const float scale_x, const float scale_y,
-                                      InterpolationPolicy policy, BorderMode border_mode, T constant_border_value, SamplingPolicy sampling_policy,
-                                      QuantizationInfo quantization_info)
+    SimpleTensor<T> compute_reference(const TensorShape &shape)
     {
         // Create reference
-        SimpleTensor<T> src{ shape, _data_type, 1, quantization_info };
+        SimpleTensor<T> src{ shape, _data_type, 1, _quantization_info };
 
         // Fill reference
         fill(src);
 
-        return reference::scale<T>(src, scale_x, scale_y, policy, border_mode, constant_border_value, sampling_policy, /* ceil_policy_scale */ false, _align_corners);
+        return reference::scale<T>(src, _scale_x, _scale_y, _policy, _border_mode, _constant_border_value, _sampling_policy, /* ceil_policy_scale */ false, _align_corners);
     }
 
     TensorType          _target{};
@@ -176,10 +173,13 @@
     TensorShape         _shape{};
     InterpolationPolicy _policy{};
     BorderMode          _border_mode{};
+    T                   _constant_border_value{};
     SamplingPolicy      _sampling_policy{};
     DataType            _data_type{};
     QuantizationInfo    _quantization_info{};
     bool                _align_corners{ false };
+    float               _scale_x{ 1.f };
+    float               _scale_y{ 1.f };
 };
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
diff --git a/tests/validation/fixtures/ScharrFixture.h b/tests/validation/fixtures/ScharrFixture.h
index 36b8e98..204ffc6 100644
--- a/tests/validation/fixtures/ScharrFixture.h
+++ b/tests/validation/fixtures/ScharrFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/SelectFixture.h b/tests/validation/fixtures/SelectFixture.h
index 2ddc97a..96a7c86 100644
--- a/tests/validation/fixtures/SelectFixture.h
+++ b/tests/validation/fixtures/SelectFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/SliceOperationsFixtures.h b/tests/validation/fixtures/SliceOperationsFixtures.h
index df016d5..c1e046e 100644
--- a/tests/validation/fixtures/SliceOperationsFixtures.h
+++ b/tests/validation/fixtures/SliceOperationsFixtures.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/SobelFixture.h b/tests/validation/fixtures/SobelFixture.h
index 2e065e6..61a6a80 100644
--- a/tests/validation/fixtures/SobelFixture.h
+++ b/tests/validation/fixtures/SobelFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/SoftmaxLayerFixture.h b/tests/validation/fixtures/SoftmaxLayerFixture.h
index aeff777..29a3ed2 100644
--- a/tests/validation/fixtures/SoftmaxLayerFixture.h
+++ b/tests/validation/fixtures/SoftmaxLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/SpaceToBatchFixture.h b/tests/validation/fixtures/SpaceToBatchFixture.h
index d88ecb9..c4076e6 100644
--- a/tests/validation/fixtures/SpaceToBatchFixture.h
+++ b/tests/validation/fixtures/SpaceToBatchFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/SpaceToDepthFixture.h b/tests/validation/fixtures/SpaceToDepthFixture.h
index 170fdfa..24ae020 100644
--- a/tests/validation/fixtures/SpaceToDepthFixture.h
+++ b/tests/validation/fixtures/SpaceToDepthFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/SplitFixture.h b/tests/validation/fixtures/SplitFixture.h
index d2336ab..03ff41e 100644
--- a/tests/validation/fixtures/SplitFixture.h
+++ b/tests/validation/fixtures/SplitFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/StackLayerFixture.h b/tests/validation/fixtures/StackLayerFixture.h
index cf055b5..7bf63a3 100644
--- a/tests/validation/fixtures/StackLayerFixture.h
+++ b/tests/validation/fixtures/StackLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/TableLookupFixture.h b/tests/validation/fixtures/TableLookupFixture.h
index 6886c7e..a50c9fb 100644
--- a/tests/validation/fixtures/TableLookupFixture.h
+++ b/tests/validation/fixtures/TableLookupFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/ThresholdFixture.h b/tests/validation/fixtures/ThresholdFixture.h
index 9a92175..038c296 100644
--- a/tests/validation/fixtures/ThresholdFixture.h
+++ b/tests/validation/fixtures/ThresholdFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -65,7 +65,7 @@
 
         // Create and configure function
         FunctionType thrsh;
-        thrsh.configure(&src, &dst, threshold, false_value, true_value, type, upper);
+        thrsh.configure(&src, &dst, ThresholdKernelInfo(threshold, false_value, true_value, type, upper));
 
         ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
         ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
diff --git a/tests/validation/fixtures/TileFixture.h b/tests/validation/fixtures/TileFixture.h
index cb70a6c..0dfcc33 100644
--- a/tests/validation/fixtures/TileFixture.h
+++ b/tests/validation/fixtures/TileFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/TransposeFixture.h b/tests/validation/fixtures/TransposeFixture.h
index c798162..757e6c3 100644
--- a/tests/validation/fixtures/TransposeFixture.h
+++ b/tests/validation/fixtures/TransposeFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/UNIT/DynamicTensorFixture.h b/tests/validation/fixtures/UNIT/DynamicTensorFixture.h
index 8638050..74e62fb 100644
--- a/tests/validation/fixtures/UNIT/DynamicTensorFixture.h
+++ b/tests/validation/fixtures/UNIT/DynamicTensorFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/UNIT/MemoryManagerFixture.h b/tests/validation/fixtures/UNIT/MemoryManagerFixture.h
index 01f9092..14f22a8 100644
--- a/tests/validation/fixtures/UNIT/MemoryManagerFixture.h
+++ b/tests/validation/fixtures/UNIT/MemoryManagerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/UNIT/WeightsRetentionFixture.h b/tests/validation/fixtures/UNIT/WeightsRetentionFixture.h
index b17c003..36d338d 100644
--- a/tests/validation/fixtures/UNIT/WeightsRetentionFixture.h
+++ b/tests/validation/fixtures/UNIT/WeightsRetentionFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/UnstackFixture.h b/tests/validation/fixtures/UnstackFixture.h
index f20a128..53c79e1 100644
--- a/tests/validation/fixtures/UnstackFixture.h
+++ b/tests/validation/fixtures/UnstackFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/UpsampleLayerFixture.h b/tests/validation/fixtures/UpsampleLayerFixture.h
index e46e19a..f7ec91f 100644
--- a/tests/validation/fixtures/UpsampleLayerFixture.h
+++ b/tests/validation/fixtures/UpsampleLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/WarpAffineFixture.h b/tests/validation/fixtures/WarpAffineFixture.h
index 3cbf86f..014d662 100644
--- a/tests/validation/fixtures/WarpAffineFixture.h
+++ b/tests/validation/fixtures/WarpAffineFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/WarpPerspectiveFixture.h b/tests/validation/fixtures/WarpPerspectiveFixture.h
index aa84946..40ae3b9 100644
--- a/tests/validation/fixtures/WarpPerspectiveFixture.h
+++ b/tests/validation/fixtures/WarpPerspectiveFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/WeightsReshapeFixture.h b/tests/validation/fixtures/WeightsReshapeFixture.h
index 06765f6..5c17b53 100644
--- a/tests/validation/fixtures/WeightsReshapeFixture.h
+++ b/tests/validation/fixtures/WeightsReshapeFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/WinogradConvolutionLayerFixture.h b/tests/validation/fixtures/WinogradConvolutionLayerFixture.h
index 9c2df9e..410c2a5 100644
--- a/tests/validation/fixtures/WinogradConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/WinogradConvolutionLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/fixtures/YOLOLayerFixture.h b/tests/validation/fixtures/YOLOLayerFixture.h
index a3842e1..d806c7b 100644
--- a/tests/validation/fixtures/YOLOLayerFixture.h
+++ b/tests/validation/fixtures/YOLOLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/AbsoluteDifference.cpp b/tests/validation/reference/AbsoluteDifference.cpp
index ea7685b..a267b4b 100644
--- a/tests/validation/reference/AbsoluteDifference.cpp
+++ b/tests/validation/reference/AbsoluteDifference.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/AbsoluteDifference.h b/tests/validation/reference/AbsoluteDifference.h
index 77849f8..7a2989b 100644
--- a/tests/validation/reference/AbsoluteDifference.h
+++ b/tests/validation/reference/AbsoluteDifference.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Accumulate.cpp b/tests/validation/reference/Accumulate.cpp
index 2758577..67754f0 100644
--- a/tests/validation/reference/Accumulate.cpp
+++ b/tests/validation/reference/Accumulate.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Accumulate.h b/tests/validation/reference/Accumulate.h
index 2113143..70b2fdb 100644
--- a/tests/validation/reference/Accumulate.h
+++ b/tests/validation/reference/Accumulate.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ActivationLayer.cpp b/tests/validation/reference/ActivationLayer.cpp
index 4aeefaa..664b969 100644
--- a/tests/validation/reference/ActivationLayer.cpp
+++ b/tests/validation/reference/ActivationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ActivationLayer.h b/tests/validation/reference/ActivationLayer.h
index 4585a9d..8aad1af 100644
--- a/tests/validation/reference/ActivationLayer.h
+++ b/tests/validation/reference/ActivationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ArithmeticDivision.cpp b/tests/validation/reference/ArithmeticDivision.cpp
index f86ee5e..414d9b9 100644
--- a/tests/validation/reference/ArithmeticDivision.cpp
+++ b/tests/validation/reference/ArithmeticDivision.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ArithmeticDivision.h b/tests/validation/reference/ArithmeticDivision.h
index f47c59f..0363ffd 100644
--- a/tests/validation/reference/ArithmeticDivision.h
+++ b/tests/validation/reference/ArithmeticDivision.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ArithmeticOperations.cpp b/tests/validation/reference/ArithmeticOperations.cpp
index fd32f45..ecf02f7 100644
--- a/tests/validation/reference/ArithmeticOperations.cpp
+++ b/tests/validation/reference/ArithmeticOperations.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -180,6 +180,7 @@
     }
 }
 
+template SimpleTensor<int32_t> arithmetic_operation(ArithmeticOperation op, const SimpleTensor<int32_t> &src1, const SimpleTensor<int32_t> &src2, SimpleTensor<int32_t> &dst, ConvertPolicy convert_policy);
 template SimpleTensor<half> arithmetic_operation(ArithmeticOperation op, const SimpleTensor<half> &src1, const SimpleTensor<half> &src2, SimpleTensor<half> &dst, ConvertPolicy convert_policy);
 template SimpleTensor<float> arithmetic_operation(ArithmeticOperation op, const SimpleTensor<float> &src1, const SimpleTensor<float> &src2, SimpleTensor<float> &dst, ConvertPolicy convert_policy);
 
@@ -193,6 +194,7 @@
     return dst;
 }
 
+template SimpleTensor<int32_t> arithmetic_operation(ArithmeticOperation op, const SimpleTensor<int32_t> &src1, const SimpleTensor<int32_t> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
 template SimpleTensor<int16_t> arithmetic_operation(ArithmeticOperation op, const SimpleTensor<int16_t> &src1, const SimpleTensor<int16_t> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
 template SimpleTensor<int8_t> arithmetic_operation(ArithmeticOperation op, const SimpleTensor<int8_t> &src1, const SimpleTensor<int8_t> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
 template SimpleTensor<half> arithmetic_operation(ArithmeticOperation op, const SimpleTensor<half> &src1, const SimpleTensor<half> &src2, DataType dst_data_type, ConvertPolicy convert_policy);
diff --git a/tests/validation/reference/ArithmeticOperations.h b/tests/validation/reference/ArithmeticOperations.h
index 53d4bbf..e3a4f73 100644
--- a/tests/validation/reference/ArithmeticOperations.h
+++ b/tests/validation/reference/ArithmeticOperations.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/BatchNormalizationLayer.cpp b/tests/validation/reference/BatchNormalizationLayer.cpp
index 6623b22..3406781 100644
--- a/tests/validation/reference/BatchNormalizationLayer.cpp
+++ b/tests/validation/reference/BatchNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/BatchNormalizationLayer.h b/tests/validation/reference/BatchNormalizationLayer.h
index 89b18ff..13a5eb2 100644
--- a/tests/validation/reference/BatchNormalizationLayer.h
+++ b/tests/validation/reference/BatchNormalizationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/BatchToSpaceLayer.cpp b/tests/validation/reference/BatchToSpaceLayer.cpp
index 662a707..404ee73 100644
--- a/tests/validation/reference/BatchToSpaceLayer.cpp
+++ b/tests/validation/reference/BatchToSpaceLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/BatchToSpaceLayer.h b/tests/validation/reference/BatchToSpaceLayer.h
index da8c1ef..52556cb 100644
--- a/tests/validation/reference/BatchToSpaceLayer.h
+++ b/tests/validation/reference/BatchToSpaceLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/BitwiseAnd.cpp b/tests/validation/reference/BitwiseAnd.cpp
index 356c27e..922f75a 100644
--- a/tests/validation/reference/BitwiseAnd.cpp
+++ b/tests/validation/reference/BitwiseAnd.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/BitwiseAnd.h b/tests/validation/reference/BitwiseAnd.h
index def8669..f78be73 100644
--- a/tests/validation/reference/BitwiseAnd.h
+++ b/tests/validation/reference/BitwiseAnd.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/BitwiseNot.cpp b/tests/validation/reference/BitwiseNot.cpp
index 03578a3..d68a010 100644
--- a/tests/validation/reference/BitwiseNot.cpp
+++ b/tests/validation/reference/BitwiseNot.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/BitwiseNot.h b/tests/validation/reference/BitwiseNot.h
index a617853..f4294ee 100644
--- a/tests/validation/reference/BitwiseNot.h
+++ b/tests/validation/reference/BitwiseNot.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/BitwiseOr.cpp b/tests/validation/reference/BitwiseOr.cpp
index 11c0a93..d38ae46 100644
--- a/tests/validation/reference/BitwiseOr.cpp
+++ b/tests/validation/reference/BitwiseOr.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/BitwiseOr.h b/tests/validation/reference/BitwiseOr.h
index 9e9c3e1..9f5c7cb 100644
--- a/tests/validation/reference/BitwiseOr.h
+++ b/tests/validation/reference/BitwiseOr.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/BitwiseXor.cpp b/tests/validation/reference/BitwiseXor.cpp
index afae032..db5c953 100644
--- a/tests/validation/reference/BitwiseXor.cpp
+++ b/tests/validation/reference/BitwiseXor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/BitwiseXor.h b/tests/validation/reference/BitwiseXor.h
index 68478d0..c1a4d6f 100644
--- a/tests/validation/reference/BitwiseXor.h
+++ b/tests/validation/reference/BitwiseXor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/BoundingBoxTransform.cpp b/tests/validation/reference/BoundingBoxTransform.cpp
index 89182f1..2abadd0 100644
--- a/tests/validation/reference/BoundingBoxTransform.cpp
+++ b/tests/validation/reference/BoundingBoxTransform.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/BoundingBoxTransform.h b/tests/validation/reference/BoundingBoxTransform.h
index 6900f67..df243ef 100644
--- a/tests/validation/reference/BoundingBoxTransform.h
+++ b/tests/validation/reference/BoundingBoxTransform.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Box3x3.cpp b/tests/validation/reference/Box3x3.cpp
index 7ea3f1f..ccc7f1b 100644
--- a/tests/validation/reference/Box3x3.cpp
+++ b/tests/validation/reference/Box3x3.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Box3x3.h b/tests/validation/reference/Box3x3.h
index 64820dc..f377f28 100644
--- a/tests/validation/reference/Box3x3.h
+++ b/tests/validation/reference/Box3x3.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/CannyEdgeDetector.cpp b/tests/validation/reference/CannyEdgeDetector.cpp
index a952dde..aa2351d 100644
--- a/tests/validation/reference/CannyEdgeDetector.cpp
+++ b/tests/validation/reference/CannyEdgeDetector.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/CannyEdgeDetector.h b/tests/validation/reference/CannyEdgeDetector.h
index 664eab7..e05895a 100644
--- a/tests/validation/reference/CannyEdgeDetector.h
+++ b/tests/validation/reference/CannyEdgeDetector.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ChannelCombine.cpp b/tests/validation/reference/ChannelCombine.cpp
index 2380b58..dcd4cf5 100644
--- a/tests/validation/reference/ChannelCombine.cpp
+++ b/tests/validation/reference/ChannelCombine.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ChannelCombine.h b/tests/validation/reference/ChannelCombine.h
index 7f870aa..315e2d4 100644
--- a/tests/validation/reference/ChannelCombine.h
+++ b/tests/validation/reference/ChannelCombine.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ChannelExtract.cpp b/tests/validation/reference/ChannelExtract.cpp
index 75d0a00..8674510 100644
--- a/tests/validation/reference/ChannelExtract.cpp
+++ b/tests/validation/reference/ChannelExtract.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ChannelExtract.h b/tests/validation/reference/ChannelExtract.h
index fd34908..ce1e673 100644
--- a/tests/validation/reference/ChannelExtract.h
+++ b/tests/validation/reference/ChannelExtract.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ChannelShuffle.cpp b/tests/validation/reference/ChannelShuffle.cpp
index 39d89e9..89ef080 100644
--- a/tests/validation/reference/ChannelShuffle.cpp
+++ b/tests/validation/reference/ChannelShuffle.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ChannelShuffle.h b/tests/validation/reference/ChannelShuffle.h
index 7c72977..d83be36 100644
--- a/tests/validation/reference/ChannelShuffle.h
+++ b/tests/validation/reference/ChannelShuffle.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Col2Im.cpp b/tests/validation/reference/Col2Im.cpp
index f42582b..04c4128 100644
--- a/tests/validation/reference/Col2Im.cpp
+++ b/tests/validation/reference/Col2Im.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Col2Im.h b/tests/validation/reference/Col2Im.h
index 6ca76d0..57e93d9 100644
--- a/tests/validation/reference/Col2Im.h
+++ b/tests/validation/reference/Col2Im.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ColorConvert.cpp b/tests/validation/reference/ColorConvert.cpp
index a759594..c6a4630 100644
--- a/tests/validation/reference/ColorConvert.cpp
+++ b/tests/validation/reference/ColorConvert.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ColorConvert.h b/tests/validation/reference/ColorConvert.h
index 75efeb2..28776cb 100644
--- a/tests/validation/reference/ColorConvert.h
+++ b/tests/validation/reference/ColorConvert.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ColorConvertHelper.h b/tests/validation/reference/ColorConvertHelper.h
index f3865dd..100b4dc 100644
--- a/tests/validation/reference/ColorConvertHelper.h
+++ b/tests/validation/reference/ColorConvertHelper.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Comparisons.cpp b/tests/validation/reference/Comparisons.cpp
index 2313d9b..6b18e90 100644
--- a/tests/validation/reference/Comparisons.cpp
+++ b/tests/validation/reference/Comparisons.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Comparisons.h b/tests/validation/reference/Comparisons.h
index 262f4b7..f0867a8 100644
--- a/tests/validation/reference/Comparisons.h
+++ b/tests/validation/reference/Comparisons.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ComputeAllAnchors.cpp b/tests/validation/reference/ComputeAllAnchors.cpp
index 9654da2..8715a5a 100644
--- a/tests/validation/reference/ComputeAllAnchors.cpp
+++ b/tests/validation/reference/ComputeAllAnchors.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ComputeAllAnchors.h b/tests/validation/reference/ComputeAllAnchors.h
index 7dfa268..838d780 100644
--- a/tests/validation/reference/ComputeAllAnchors.h
+++ b/tests/validation/reference/ComputeAllAnchors.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ConcatenateLayer.cpp b/tests/validation/reference/ConcatenateLayer.cpp
index 266dae1..8910f48 100644
--- a/tests/validation/reference/ConcatenateLayer.cpp
+++ b/tests/validation/reference/ConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ConcatenateLayer.h b/tests/validation/reference/ConcatenateLayer.h
index a85a66e..b773221 100644
--- a/tests/validation/reference/ConcatenateLayer.h
+++ b/tests/validation/reference/ConcatenateLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ConvertFullyConnectedWeights.cpp b/tests/validation/reference/ConvertFullyConnectedWeights.cpp
index 710644a..0bb65fc 100644
--- a/tests/validation/reference/ConvertFullyConnectedWeights.cpp
+++ b/tests/validation/reference/ConvertFullyConnectedWeights.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ConvertFullyConnectedWeights.h b/tests/validation/reference/ConvertFullyConnectedWeights.h
index daeba0e..6159873 100644
--- a/tests/validation/reference/ConvertFullyConnectedWeights.h
+++ b/tests/validation/reference/ConvertFullyConnectedWeights.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Convolution.cpp b/tests/validation/reference/Convolution.cpp
index ad93b30..0a4e043 100644
--- a/tests/validation/reference/Convolution.cpp
+++ b/tests/validation/reference/Convolution.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Convolution.h b/tests/validation/reference/Convolution.h
index ae6650e..174ce7e 100644
--- a/tests/validation/reference/Convolution.h
+++ b/tests/validation/reference/Convolution.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Convolution3d.h b/tests/validation/reference/Convolution3d.h
index 6168f10..03a2f53 100644
--- a/tests/validation/reference/Convolution3d.h
+++ b/tests/validation/reference/Convolution3d.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ConvolutionLayer.cpp b/tests/validation/reference/ConvolutionLayer.cpp
index 9675901..8eb8553 100644
--- a/tests/validation/reference/ConvolutionLayer.cpp
+++ b/tests/validation/reference/ConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ConvolutionLayer.h b/tests/validation/reference/ConvolutionLayer.h
index 61733d5..40d235a 100644
--- a/tests/validation/reference/ConvolutionLayer.h
+++ b/tests/validation/reference/ConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Copy.cpp b/tests/validation/reference/Copy.cpp
index dc519a4..181d85e 100644
--- a/tests/validation/reference/Copy.cpp
+++ b/tests/validation/reference/Copy.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Copy.h b/tests/validation/reference/Copy.h
index 10a3a61..1998b9e 100644
--- a/tests/validation/reference/Copy.h
+++ b/tests/validation/reference/Copy.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/CropResize.cpp b/tests/validation/reference/CropResize.cpp
index 68ee455..4126790 100644
--- a/tests/validation/reference/CropResize.cpp
+++ b/tests/validation/reference/CropResize.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -193,6 +193,8 @@
                                              Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value);
 template SimpleTensor<float> crop_and_resize(const SimpleTensor<half> &src, const SimpleTensor<float> &boxes, SimpleTensor<int32_t> box_ind,
                                              Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value);
+template SimpleTensor<float> crop_and_resize(const SimpleTensor<uint8_t> &src, const SimpleTensor<float> &boxes, SimpleTensor<int32_t> box_ind,
+                                             Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/CropResize.h b/tests/validation/reference/CropResize.h
index 6c07306..2f88ad6 100644
--- a/tests/validation/reference/CropResize.h
+++ b/tests/validation/reference/CropResize.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/DFT.cpp b/tests/validation/reference/DFT.cpp
index 7221312..1f746ea 100644
--- a/tests/validation/reference/DFT.cpp
+++ b/tests/validation/reference/DFT.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/DFT.h b/tests/validation/reference/DFT.h
index 997343c..5b12e49 100644
--- a/tests/validation/reference/DFT.h
+++ b/tests/validation/reference/DFT.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/DeconvolutionLayer.cpp b/tests/validation/reference/DeconvolutionLayer.cpp
index 3cfbfae..8918285 100644
--- a/tests/validation/reference/DeconvolutionLayer.cpp
+++ b/tests/validation/reference/DeconvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/DeconvolutionLayer.h b/tests/validation/reference/DeconvolutionLayer.h
index fff529a..07b9a53 100644
--- a/tests/validation/reference/DeconvolutionLayer.h
+++ b/tests/validation/reference/DeconvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/DepthConcatenateLayer.cpp b/tests/validation/reference/DepthConcatenateLayer.cpp
index 2c93e70..54226a4 100644
--- a/tests/validation/reference/DepthConcatenateLayer.cpp
+++ b/tests/validation/reference/DepthConcatenateLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/DepthConcatenateLayer.h b/tests/validation/reference/DepthConcatenateLayer.h
index 264d8d0..56f9aba 100644
--- a/tests/validation/reference/DepthConcatenateLayer.h
+++ b/tests/validation/reference/DepthConcatenateLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/DepthConvertLayer.cpp b/tests/validation/reference/DepthConvertLayer.cpp
index 9c6a9aa..30b7e57 100644
--- a/tests/validation/reference/DepthConvertLayer.cpp
+++ b/tests/validation/reference/DepthConvertLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/DepthConvertLayer.h b/tests/validation/reference/DepthConvertLayer.h
index 9513d07..a9c7500 100644
--- a/tests/validation/reference/DepthConvertLayer.h
+++ b/tests/validation/reference/DepthConvertLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/DepthToSpaceLayer.cpp b/tests/validation/reference/DepthToSpaceLayer.cpp
index e2329ed..29a3075 100644
--- a/tests/validation/reference/DepthToSpaceLayer.cpp
+++ b/tests/validation/reference/DepthToSpaceLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/DepthToSpaceLayer.h b/tests/validation/reference/DepthToSpaceLayer.h
index 6c83342..6c63a11 100644
--- a/tests/validation/reference/DepthToSpaceLayer.h
+++ b/tests/validation/reference/DepthToSpaceLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/DepthwiseConvolutionLayer.cpp b/tests/validation/reference/DepthwiseConvolutionLayer.cpp
index 7bba98a..e24b5b4 100644
--- a/tests/validation/reference/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/reference/DepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/DepthwiseConvolutionLayer.h b/tests/validation/reference/DepthwiseConvolutionLayer.h
index d0dabb9..1b93acf 100644
--- a/tests/validation/reference/DepthwiseConvolutionLayer.h
+++ b/tests/validation/reference/DepthwiseConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/DepthwiseSeparableConvolutionLayer.h b/tests/validation/reference/DepthwiseSeparableConvolutionLayer.h
index 6165faf..b831f6a 100644
--- a/tests/validation/reference/DepthwiseSeparableConvolutionLayer.h
+++ b/tests/validation/reference/DepthwiseSeparableConvolutionLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/DequantizationLayer.cpp b/tests/validation/reference/DequantizationLayer.cpp
index 7dec988..64a89aa 100644
--- a/tests/validation/reference/DequantizationLayer.cpp
+++ b/tests/validation/reference/DequantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/DequantizationLayer.h b/tests/validation/reference/DequantizationLayer.h
index 0bf10a2..a0a83ae 100644
--- a/tests/validation/reference/DequantizationLayer.h
+++ b/tests/validation/reference/DequantizationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Derivative.cpp b/tests/validation/reference/Derivative.cpp
index f4c2934..c65ebca 100644
--- a/tests/validation/reference/Derivative.cpp
+++ b/tests/validation/reference/Derivative.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Derivative.h b/tests/validation/reference/Derivative.h
index 829d2ee..16f764e 100644
--- a/tests/validation/reference/Derivative.h
+++ b/tests/validation/reference/Derivative.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Dilate.cpp b/tests/validation/reference/Dilate.cpp
index cba9af1..be8ccb6 100644
--- a/tests/validation/reference/Dilate.cpp
+++ b/tests/validation/reference/Dilate.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Dilate.h b/tests/validation/reference/Dilate.h
index 6755f02..640bc9d 100644
--- a/tests/validation/reference/Dilate.h
+++ b/tests/validation/reference/Dilate.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ElementWiseUnary.cpp b/tests/validation/reference/ElementWiseUnary.cpp
index f1bb7c7..1d46ed6 100644
--- a/tests/validation/reference/ElementWiseUnary.cpp
+++ b/tests/validation/reference/ElementWiseUnary.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ElementWiseUnary.h b/tests/validation/reference/ElementWiseUnary.h
index a8bed19..be4a229 100644
--- a/tests/validation/reference/ElementWiseUnary.h
+++ b/tests/validation/reference/ElementWiseUnary.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ElementwiseOperations.cpp b/tests/validation/reference/ElementwiseOperations.cpp
index 2a76b31..aab9d9d 100644
--- a/tests/validation/reference/ElementwiseOperations.cpp
+++ b/tests/validation/reference/ElementwiseOperations.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ElementwiseOperations.h b/tests/validation/reference/ElementwiseOperations.h
index 912fe1a..79d0042 100644
--- a/tests/validation/reference/ElementwiseOperations.h
+++ b/tests/validation/reference/ElementwiseOperations.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/EqualizeHistogram.cpp b/tests/validation/reference/EqualizeHistogram.cpp
index 34e7c39..8a957d7 100644
--- a/tests/validation/reference/EqualizeHistogram.cpp
+++ b/tests/validation/reference/EqualizeHistogram.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/EqualizeHistogram.h b/tests/validation/reference/EqualizeHistogram.h
index 5d22cbb..c79b213 100644
--- a/tests/validation/reference/EqualizeHistogram.h
+++ b/tests/validation/reference/EqualizeHistogram.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Erode.cpp b/tests/validation/reference/Erode.cpp
index 0964c3d..0413eb5 100644
--- a/tests/validation/reference/Erode.cpp
+++ b/tests/validation/reference/Erode.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Erode.h b/tests/validation/reference/Erode.h
index 4ec06d7..a809be8 100644
--- a/tests/validation/reference/Erode.h
+++ b/tests/validation/reference/Erode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/FastCorners.cpp b/tests/validation/reference/FastCorners.cpp
index 32b9115..25fbf1b 100644
--- a/tests/validation/reference/FastCorners.cpp
+++ b/tests/validation/reference/FastCorners.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/FastCorners.h b/tests/validation/reference/FastCorners.h
index 00ee736..2c4506d 100644
--- a/tests/validation/reference/FastCorners.h
+++ b/tests/validation/reference/FastCorners.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/FlattenLayer.cpp b/tests/validation/reference/FlattenLayer.cpp
index 381ce37..145b41a 100644
--- a/tests/validation/reference/FlattenLayer.cpp
+++ b/tests/validation/reference/FlattenLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/FlattenLayer.h b/tests/validation/reference/FlattenLayer.h
index 2b8083a..212bb08 100644
--- a/tests/validation/reference/FlattenLayer.h
+++ b/tests/validation/reference/FlattenLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Floor.cpp b/tests/validation/reference/Floor.cpp
index 21fa1c9..8a3fb24 100644
--- a/tests/validation/reference/Floor.cpp
+++ b/tests/validation/reference/Floor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Floor.h b/tests/validation/reference/Floor.h
index e0f09c9..b197f15 100644
--- a/tests/validation/reference/Floor.h
+++ b/tests/validation/reference/Floor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/FullyConnectedLayer.cpp b/tests/validation/reference/FullyConnectedLayer.cpp
index 908c583..2133395 100644
--- a/tests/validation/reference/FullyConnectedLayer.cpp
+++ b/tests/validation/reference/FullyConnectedLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/FullyConnectedLayer.h b/tests/validation/reference/FullyConnectedLayer.h
index 0afffc3..c807e06 100644
--- a/tests/validation/reference/FullyConnectedLayer.h
+++ b/tests/validation/reference/FullyConnectedLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/FuseBatchNormalization.cpp b/tests/validation/reference/FuseBatchNormalization.cpp
index cb50038..08e94a3 100644
--- a/tests/validation/reference/FuseBatchNormalization.cpp
+++ b/tests/validation/reference/FuseBatchNormalization.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/FuseBatchNormalization.h b/tests/validation/reference/FuseBatchNormalization.h
index d741bf3..9c9f9ac 100644
--- a/tests/validation/reference/FuseBatchNormalization.h
+++ b/tests/validation/reference/FuseBatchNormalization.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/GEMM.cpp b/tests/validation/reference/GEMM.cpp
index f7a1718..6b3aa39 100644
--- a/tests/validation/reference/GEMM.cpp
+++ b/tests/validation/reference/GEMM.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/GEMM.h b/tests/validation/reference/GEMM.h
index 9d82818..5feaeda 100644
--- a/tests/validation/reference/GEMM.h
+++ b/tests/validation/reference/GEMM.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/GEMMInterleave4x4.h b/tests/validation/reference/GEMMInterleave4x4.h
index e3d72d9..3493355 100644
--- a/tests/validation/reference/GEMMInterleave4x4.h
+++ b/tests/validation/reference/GEMMInterleave4x4.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/GEMMInterleaveBlocked.h b/tests/validation/reference/GEMMInterleaveBlocked.h
index d649a51..0e75d2b 100644
--- a/tests/validation/reference/GEMMInterleaveBlocked.h
+++ b/tests/validation/reference/GEMMInterleaveBlocked.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/GEMMLowp.cpp b/tests/validation/reference/GEMMLowp.cpp
index 85a98e4..1615b51 100644
--- a/tests/validation/reference/GEMMLowp.cpp
+++ b/tests/validation/reference/GEMMLowp.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/GEMMLowp.h b/tests/validation/reference/GEMMLowp.h
index 5de48da..99015d7 100644
--- a/tests/validation/reference/GEMMLowp.h
+++ b/tests/validation/reference/GEMMLowp.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/GEMMReshapeLHSMatrix.cpp b/tests/validation/reference/GEMMReshapeLHSMatrix.cpp
index f21fe50..ab50e3d 100644
--- a/tests/validation/reference/GEMMReshapeLHSMatrix.cpp
+++ b/tests/validation/reference/GEMMReshapeLHSMatrix.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/GEMMReshapeLHSMatrix.h b/tests/validation/reference/GEMMReshapeLHSMatrix.h
index 35fff91..fefa946 100644
--- a/tests/validation/reference/GEMMReshapeLHSMatrix.h
+++ b/tests/validation/reference/GEMMReshapeLHSMatrix.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/GEMMReshapeRHSMatrix.cpp b/tests/validation/reference/GEMMReshapeRHSMatrix.cpp
index ebb6f85..d691338 100644
--- a/tests/validation/reference/GEMMReshapeRHSMatrix.cpp
+++ b/tests/validation/reference/GEMMReshapeRHSMatrix.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/GEMMReshapeRHSMatrix.h b/tests/validation/reference/GEMMReshapeRHSMatrix.h
index 4be0cde..89e360e 100644
--- a/tests/validation/reference/GEMMReshapeRHSMatrix.h
+++ b/tests/validation/reference/GEMMReshapeRHSMatrix.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/GEMMTranspose1xW.h b/tests/validation/reference/GEMMTranspose1xW.h
index 6ec70b1..14add1e 100644
--- a/tests/validation/reference/GEMMTranspose1xW.h
+++ b/tests/validation/reference/GEMMTranspose1xW.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Gather.cpp b/tests/validation/reference/Gather.cpp
index ab5ea2f..93ac09c 100644
--- a/tests/validation/reference/Gather.cpp
+++ b/tests/validation/reference/Gather.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Gather.h b/tests/validation/reference/Gather.h
index 40fac10..67b7dca 100644
--- a/tests/validation/reference/Gather.h
+++ b/tests/validation/reference/Gather.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Gaussian3x3.cpp b/tests/validation/reference/Gaussian3x3.cpp
index f2ac134..2e307e8 100644
--- a/tests/validation/reference/Gaussian3x3.cpp
+++ b/tests/validation/reference/Gaussian3x3.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Gaussian3x3.h b/tests/validation/reference/Gaussian3x3.h
index 8a9c0a3..a433db6 100644
--- a/tests/validation/reference/Gaussian3x3.h
+++ b/tests/validation/reference/Gaussian3x3.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Gaussian5x5.cpp b/tests/validation/reference/Gaussian5x5.cpp
index 426e666..2133d89 100644
--- a/tests/validation/reference/Gaussian5x5.cpp
+++ b/tests/validation/reference/Gaussian5x5.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Gaussian5x5.h b/tests/validation/reference/Gaussian5x5.h
index 9a64417..42920bd 100644
--- a/tests/validation/reference/Gaussian5x5.h
+++ b/tests/validation/reference/Gaussian5x5.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/GaussianPyramidHalf.cpp b/tests/validation/reference/GaussianPyramidHalf.cpp
index 7d5eb07..5bddd85 100644
--- a/tests/validation/reference/GaussianPyramidHalf.cpp
+++ b/tests/validation/reference/GaussianPyramidHalf.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/GaussianPyramidHalf.h b/tests/validation/reference/GaussianPyramidHalf.h
index 03a4bd4..225ef00 100644
--- a/tests/validation/reference/GaussianPyramidHalf.h
+++ b/tests/validation/reference/GaussianPyramidHalf.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/HOGDescriptor.cpp b/tests/validation/reference/HOGDescriptor.cpp
index f0f573a..e00beaf 100644
--- a/tests/validation/reference/HOGDescriptor.cpp
+++ b/tests/validation/reference/HOGDescriptor.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/HOGDescriptor.h b/tests/validation/reference/HOGDescriptor.h
index ef189ec..dffeb65 100644
--- a/tests/validation/reference/HOGDescriptor.h
+++ b/tests/validation/reference/HOGDescriptor.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/HOGDetector.cpp b/tests/validation/reference/HOGDetector.cpp
index 8ca1b0c..798c3fc 100644
--- a/tests/validation/reference/HOGDetector.cpp
+++ b/tests/validation/reference/HOGDetector.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/HOGDetector.h b/tests/validation/reference/HOGDetector.h
index 86f45b4..9809ae3 100644
--- a/tests/validation/reference/HOGDetector.h
+++ b/tests/validation/reference/HOGDetector.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/HOGMultiDetection.cpp b/tests/validation/reference/HOGMultiDetection.cpp
index 2f5e439..50d846c 100644
--- a/tests/validation/reference/HOGMultiDetection.cpp
+++ b/tests/validation/reference/HOGMultiDetection.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/HOGMultiDetection.h b/tests/validation/reference/HOGMultiDetection.h
index 2b38aae..7194af7 100644
--- a/tests/validation/reference/HOGMultiDetection.h
+++ b/tests/validation/reference/HOGMultiDetection.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/HarrisCornerDetector.cpp b/tests/validation/reference/HarrisCornerDetector.cpp
index 442f6a1..6c46b3d 100644
--- a/tests/validation/reference/HarrisCornerDetector.cpp
+++ b/tests/validation/reference/HarrisCornerDetector.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/HarrisCornerDetector.h b/tests/validation/reference/HarrisCornerDetector.h
index 590e9ff..2f46474 100644
--- a/tests/validation/reference/HarrisCornerDetector.h
+++ b/tests/validation/reference/HarrisCornerDetector.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Histogram.cpp b/tests/validation/reference/Histogram.cpp
index 594c4fb..f9c7710 100644
--- a/tests/validation/reference/Histogram.cpp
+++ b/tests/validation/reference/Histogram.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Histogram.h b/tests/validation/reference/Histogram.h
index 60fa758..5f6c7d2 100644
--- a/tests/validation/reference/Histogram.h
+++ b/tests/validation/reference/Histogram.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Im2Col.cpp b/tests/validation/reference/Im2Col.cpp
index a3dcf07..91544a4 100644
--- a/tests/validation/reference/Im2Col.cpp
+++ b/tests/validation/reference/Im2Col.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Im2Col.h b/tests/validation/reference/Im2Col.h
index 68c1c07..db0f2b4 100644
--- a/tests/validation/reference/Im2Col.h
+++ b/tests/validation/reference/Im2Col.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/InstanceNormalizationLayer.cpp b/tests/validation/reference/InstanceNormalizationLayer.cpp
index 3395497..8a5757b 100644
--- a/tests/validation/reference/InstanceNormalizationLayer.cpp
+++ b/tests/validation/reference/InstanceNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/InstanceNormalizationLayer.h b/tests/validation/reference/InstanceNormalizationLayer.h
index 5c4ee6d..5f9ea8b 100644
--- a/tests/validation/reference/InstanceNormalizationLayer.h
+++ b/tests/validation/reference/InstanceNormalizationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/IntegralImage.cpp b/tests/validation/reference/IntegralImage.cpp
index 8d07e99..0f6a750 100644
--- a/tests/validation/reference/IntegralImage.cpp
+++ b/tests/validation/reference/IntegralImage.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/IntegralImage.h b/tests/validation/reference/IntegralImage.h
index 0d2314c..2c9b96a 100644
--- a/tests/validation/reference/IntegralImage.h
+++ b/tests/validation/reference/IntegralImage.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/L2NormalizeLayer.cpp b/tests/validation/reference/L2NormalizeLayer.cpp
index 43885b2..265820d 100644
--- a/tests/validation/reference/L2NormalizeLayer.cpp
+++ b/tests/validation/reference/L2NormalizeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/L2NormalizeLayer.h b/tests/validation/reference/L2NormalizeLayer.h
index 41817c8..29af6cc 100644
--- a/tests/validation/reference/L2NormalizeLayer.h
+++ b/tests/validation/reference/L2NormalizeLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/LaplacianPyramid.cpp b/tests/validation/reference/LaplacianPyramid.cpp
index 21ddc1e..904b840 100644
--- a/tests/validation/reference/LaplacianPyramid.cpp
+++ b/tests/validation/reference/LaplacianPyramid.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/LaplacianPyramid.h b/tests/validation/reference/LaplacianPyramid.h
index d9f4ef0..0596b81 100644
--- a/tests/validation/reference/LaplacianPyramid.h
+++ b/tests/validation/reference/LaplacianPyramid.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/LaplacianReconstruct.cpp b/tests/validation/reference/LaplacianReconstruct.cpp
index ef14355..2a0fcc2 100644
--- a/tests/validation/reference/LaplacianReconstruct.cpp
+++ b/tests/validation/reference/LaplacianReconstruct.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/LaplacianReconstruct.h b/tests/validation/reference/LaplacianReconstruct.h
index 24efc1b..8820c92 100644
--- a/tests/validation/reference/LaplacianReconstruct.h
+++ b/tests/validation/reference/LaplacianReconstruct.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/LocallyConnected.cpp b/tests/validation/reference/LocallyConnected.cpp
index ecc582b..a5141f2 100644
--- a/tests/validation/reference/LocallyConnected.cpp
+++ b/tests/validation/reference/LocallyConnected.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/LocallyConnected.h b/tests/validation/reference/LocallyConnected.h
index 97df45e..c85d0e9 100644
--- a/tests/validation/reference/LocallyConnected.h
+++ b/tests/validation/reference/LocallyConnected.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/LogSoftmaxLayer.cpp b/tests/validation/reference/LogSoftmaxLayer.cpp
index edb208e..8d3b8f7 100644
--- a/tests/validation/reference/LogSoftmaxLayer.cpp
+++ b/tests/validation/reference/LogSoftmaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -35,26 +35,26 @@
 namespace reference
 {
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
-SimpleTensor<T> log_softmax_layer(const SimpleTensor<T> &src, float beta, int32_t axis)
+SimpleTensor<T> log_softmax_layer(const SimpleTensor<T> &src, float beta, int32_t reduce_end_axis)
 {
-    return softmax_layer_generic<T>(src, beta, axis, true);
+    return softmax_layer_generic<T>(src, beta, reduce_end_axis, true);
 }
 
 template < typename T, typename std::enable_if < std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value, int >::type >
-SimpleTensor<T> log_softmax_layer(const SimpleTensor<T> &src, float beta, int32_t axis)
+SimpleTensor<T> log_softmax_layer(const SimpleTensor<T> &src, float beta, int32_t reduce_end_axis)
 {
     const QuantizationInfo output_quantization_info = arm_compute::get_softmax_output_quantization_info(src.data_type(), true);
 
     SimpleTensor<float> src_tmp = convert_from_asymmetric(src);
-    SimpleTensor<float> dst_tmp = log_softmax_layer<float>(src_tmp, beta, axis);
+    SimpleTensor<float> dst_tmp = log_softmax_layer<float>(src_tmp, beta, reduce_end_axis);
     SimpleTensor<T>     dst     = convert_to_asymmetric<T>(dst_tmp, output_quantization_info);
     return dst;
 }
 
-template SimpleTensor<float> log_softmax_layer(const SimpleTensor<float> &src, float beta, int32_t axis);
-template SimpleTensor<half> log_softmax_layer(const SimpleTensor<half> &src, float beta, int32_t axis);
-template SimpleTensor<uint8_t> log_softmax_layer(const SimpleTensor<uint8_t> &src, float beta, int32_t axis);
-template SimpleTensor<int8_t> log_softmax_layer(const SimpleTensor<int8_t> &src, float beta, int32_t axis);
+template SimpleTensor<float> log_softmax_layer(const SimpleTensor<float> &src, float beta, int32_t reduce_end_axis);
+template SimpleTensor<half> log_softmax_layer(const SimpleTensor<half> &src, float beta, int32_t reduce_end_axis);
+template SimpleTensor<uint8_t> log_softmax_layer(const SimpleTensor<uint8_t> &src, float beta, int32_t reduce_end_axis);
+template SimpleTensor<int8_t> log_softmax_layer(const SimpleTensor<int8_t> &src, float beta, int32_t reduce_end_axis);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/LogSoftmaxLayer.h b/tests/validation/reference/LogSoftmaxLayer.h
index 48ffdcf..db94507 100644
--- a/tests/validation/reference/LogSoftmaxLayer.h
+++ b/tests/validation/reference/LogSoftmaxLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,10 +36,10 @@
 namespace reference
 {
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
-SimpleTensor<T> log_softmax_layer(const SimpleTensor<T> &src, float beta, int32_t axis = -1);
+SimpleTensor<T> log_softmax_layer(const SimpleTensor<T> &src, float beta, int32_t reduce_end_axis = 0);
 
 template < typename T, typename std::enable_if < std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value, int >::type = 0 >
-SimpleTensor<T> log_softmax_layer(const SimpleTensor<T> &src, float beta, int32_t axis = -1);
+SimpleTensor<T> log_softmax_layer(const SimpleTensor<T> &src, float beta, int32_t reduce_end_axis = 0);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/Magnitude.cpp b/tests/validation/reference/Magnitude.cpp
index f0002bf..390aaa5 100644
--- a/tests/validation/reference/Magnitude.cpp
+++ b/tests/validation/reference/Magnitude.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Magnitude.h b/tests/validation/reference/Magnitude.h
index 870ea28..81db27d 100644
--- a/tests/validation/reference/Magnitude.h
+++ b/tests/validation/reference/Magnitude.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/MaxUnpoolingLayer.cpp b/tests/validation/reference/MaxUnpoolingLayer.cpp
new file mode 100644
index 0000000..880018e
--- /dev/null
+++ b/tests/validation/reference/MaxUnpoolingLayer.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "MaxUnpoolingLayer.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+using namespace arm_compute::misc::shape_calculator;
+
+template <typename T>
+SimpleTensor<T> max_unpooling_layer_internal(const SimpleTensor<T> &src, const PoolingLayerInfo &info,
+                                             const QuantizationInfo &output_qinfo, SimpleTensor<uint32_t> &indices,
+                                             TensorShape output_shape, DataLayout data_layout)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_UNUSED(output_qinfo);
+    ARM_COMPUTE_UNUSED(data_layout);
+    // Create reference
+    SimpleTensor<T> dst{ output_shape, src.data_type(), 1 };
+    ARM_COMPUTE_ERROR_ON(indices.shape().total_size() == 0);
+    std::fill_n(dst.data(), dst.num_elements(), 0);
+    const auto w_indices = static_cast<int>(indices.shape()[0]);
+    const auto h_indices = static_cast<int>(indices.shape()[1]);
+    const auto z_indices = static_cast<int>(indices.shape()[2]);
+    const auto b_indices = static_cast<int>(indices.shape()[3]);
+    const auto w_dst     = static_cast<int>(dst.shape()[0]);
+    const auto h_dst     = static_cast<int>(dst.shape()[1]);
+    const auto z_dst     = static_cast<int>(dst.shape()[2]);
+    for(int b = 0; b < b_indices; ++b)
+    {
+        for(int r = 0; r < z_indices; ++r)
+        {
+            for(int h = 0; h < h_indices; ++h)
+            {
+                for(int w = 0; w < w_indices; ++w)
+                {
+                    const uint32_t index_into_dst = indices[b * z_indices * h_indices * w_indices + r * h_indices * w_indices + h * w_indices + w];
+                    const auto     input_val      = src[b * z_indices * h_indices * w_indices + r * h_indices * w_indices + h * w_indices + w];
+                    auto          *ptr            = &dst[b * z_dst * h_dst * w_dst];
+                    ptr[index_into_dst]           = input_val;
+                }
+            }
+        }
+    }
+    return dst;
+}
+
+template <>
+SimpleTensor<uint8_t> max_unpooling_layer<uint8_t>(
+    const SimpleTensor<uint8_t> &src, const PoolingLayerInfo &info,
+    const QuantizationInfo &output_qinfo, SimpleTensor<uint32_t> &indices,
+    TensorShape output_shape, DataLayout data_layout)
+
+{
+    SimpleTensor<float>   src_tmp = convert_from_asymmetric(src);
+    SimpleTensor<float>   dst_tmp = max_unpooling_layer_internal<float>(src_tmp, info, output_qinfo, indices, output_shape, data_layout);
+    SimpleTensor<uint8_t> dst     = convert_to_asymmetric<uint8_t>(dst_tmp, output_qinfo);
+    return dst;
+}
+
+template <typename T>
+SimpleTensor<T> max_unpooling_layer(const SimpleTensor<T> &src, const PoolingLayerInfo &info,
+                                    const QuantizationInfo &output_qinfo, SimpleTensor<uint32_t> &indices,
+                                    TensorShape output_shape, DataLayout data_layout)
+{
+    return max_unpooling_layer_internal<T>(src, info, output_qinfo, indices, output_shape, data_layout);
+}
+
+template SimpleTensor<float> max_unpooling_layer(const SimpleTensor<float> &src, const PoolingLayerInfo &info,
+                                                 const QuantizationInfo &output_qinfo, SimpleTensor<uint32_t> &indices,
+                                                 TensorShape output_shape, DataLayout data_layout);
+template SimpleTensor<half> max_unpooling_layer(const SimpleTensor<half> &src, const PoolingLayerInfo &info,
+                                                const QuantizationInfo &output_qinfo, SimpleTensor<uint32_t> &indices,
+                                                TensorShape output_shape, DataLayout data_layout);
+
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/MaxUnpoolingLayer.h b/tests/validation/reference/MaxUnpoolingLayer.h
new file mode 100644
index 0000000..9b00a3a
--- /dev/null
+++ b/tests/validation/reference/MaxUnpoolingLayer.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_MAXUNPOOLING_LAYER_H
+#define ARM_COMPUTE_TEST_MAXUNPOOLING_LAYER_H
+
+#include "tests/SimpleTensor.h"
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+SimpleTensor<T> max_unpooling_layer(const SimpleTensor<T> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo, SimpleTensor<uint32_t> &indices,
+                                    TensorShape output_shape, DataLayout data_layout = DataLayout::NCHW);
+
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_MAXUNPOOLING_LAYER_H */
diff --git a/tests/validation/reference/MeanStdDev.cpp b/tests/validation/reference/MeanStdDev.cpp
index f48fcb1..f6f5cb2 100644
--- a/tests/validation/reference/MeanStdDev.cpp
+++ b/tests/validation/reference/MeanStdDev.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/MeanStdDev.h b/tests/validation/reference/MeanStdDev.h
index 96e04e9..2552147 100644
--- a/tests/validation/reference/MeanStdDev.h
+++ b/tests/validation/reference/MeanStdDev.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/MeanStdDevNormalizationLayer.cpp b/tests/validation/reference/MeanStdDevNormalizationLayer.cpp
index c44c983..0a23fa1 100644
--- a/tests/validation/reference/MeanStdDevNormalizationLayer.cpp
+++ b/tests/validation/reference/MeanStdDevNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/MeanStdDevNormalizationLayer.h b/tests/validation/reference/MeanStdDevNormalizationLayer.h
index fb2d30b..968cdc7 100644
--- a/tests/validation/reference/MeanStdDevNormalizationLayer.h
+++ b/tests/validation/reference/MeanStdDevNormalizationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Median3x3.cpp b/tests/validation/reference/Median3x3.cpp
index 314bbe3..55f5f62 100644
--- a/tests/validation/reference/Median3x3.cpp
+++ b/tests/validation/reference/Median3x3.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Median3x3.h b/tests/validation/reference/Median3x3.h
index 2778269..a10f428 100644
--- a/tests/validation/reference/Median3x3.h
+++ b/tests/validation/reference/Median3x3.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/MinMaxLocation.cpp b/tests/validation/reference/MinMaxLocation.cpp
index 427adeb..b8771ba 100644
--- a/tests/validation/reference/MinMaxLocation.cpp
+++ b/tests/validation/reference/MinMaxLocation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/MinMaxLocation.h b/tests/validation/reference/MinMaxLocation.h
index 42ededf..d6ba932 100644
--- a/tests/validation/reference/MinMaxLocation.h
+++ b/tests/validation/reference/MinMaxLocation.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/NonLinearFilter.cpp b/tests/validation/reference/NonLinearFilter.cpp
index 72433eb..ada8286 100644
--- a/tests/validation/reference/NonLinearFilter.cpp
+++ b/tests/validation/reference/NonLinearFilter.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/NonLinearFilter.h b/tests/validation/reference/NonLinearFilter.h
index 7f1c4fa..ecf6563 100644
--- a/tests/validation/reference/NonLinearFilter.h
+++ b/tests/validation/reference/NonLinearFilter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/NonMaxSuppression.cpp b/tests/validation/reference/NonMaxSuppression.cpp
index 8fc370b..0c76a3f 100644
--- a/tests/validation/reference/NonMaxSuppression.cpp
+++ b/tests/validation/reference/NonMaxSuppression.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -143,7 +143,7 @@
     const size_t                    output_size       = std::min(static_cast<size_t>(max_output_size), num_boxes);
     const std::vector<CandidateBox> candidates_vector = get_candidates(scores, score_threshold);
     std::vector<int>                selected;
-    for(const auto c : candidates_vector)
+    for(const auto &c : candidates_vector)
     {
         if(selected.size() == output_size)
         {
diff --git a/tests/validation/reference/NonMaxSuppression.h b/tests/validation/reference/NonMaxSuppression.h
index ca63394..d0c9bf7 100644
--- a/tests/validation/reference/NonMaxSuppression.h
+++ b/tests/validation/reference/NonMaxSuppression.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/NonMaximaSuppression.cpp b/tests/validation/reference/NonMaximaSuppression.cpp
index 45ce67f..1132e65 100644
--- a/tests/validation/reference/NonMaximaSuppression.cpp
+++ b/tests/validation/reference/NonMaximaSuppression.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/NonMaximaSuppression.h b/tests/validation/reference/NonMaximaSuppression.h
index f2ac95f..c5f8195 100644
--- a/tests/validation/reference/NonMaximaSuppression.h
+++ b/tests/validation/reference/NonMaximaSuppression.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/NormalizationLayer.cpp b/tests/validation/reference/NormalizationLayer.cpp
index d57e6f1..044a8b3 100644
--- a/tests/validation/reference/NormalizationLayer.cpp
+++ b/tests/validation/reference/NormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/NormalizationLayer.h b/tests/validation/reference/NormalizationLayer.h
index ebdf765..9d7bc57 100644
--- a/tests/validation/reference/NormalizationLayer.h
+++ b/tests/validation/reference/NormalizationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/NormalizePlanarYUVLayer.cpp b/tests/validation/reference/NormalizePlanarYUVLayer.cpp
index d2d29cc..5e72e4a 100644
--- a/tests/validation/reference/NormalizePlanarYUVLayer.cpp
+++ b/tests/validation/reference/NormalizePlanarYUVLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/NormalizePlanarYUVLayer.h b/tests/validation/reference/NormalizePlanarYUVLayer.h
index f7420e9..1a14e31 100644
--- a/tests/validation/reference/NormalizePlanarYUVLayer.h
+++ b/tests/validation/reference/NormalizePlanarYUVLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/OpticalFlow.cpp b/tests/validation/reference/OpticalFlow.cpp
index da0b9f9..0a04214 100644
--- a/tests/validation/reference/OpticalFlow.cpp
+++ b/tests/validation/reference/OpticalFlow.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/OpticalFlow.h b/tests/validation/reference/OpticalFlow.h
index 42f2abe..1bc367a 100644
--- a/tests/validation/reference/OpticalFlow.h
+++ b/tests/validation/reference/OpticalFlow.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/PadLayer.cpp b/tests/validation/reference/PadLayer.cpp
index 182c16f..f5ba33d 100644
--- a/tests/validation/reference/PadLayer.cpp
+++ b/tests/validation/reference/PadLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/PadLayer.h b/tests/validation/reference/PadLayer.h
index 9b049d4..1441dbe 100644
--- a/tests/validation/reference/PadLayer.h
+++ b/tests/validation/reference/PadLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Permute.cpp b/tests/validation/reference/Permute.cpp
index 36b07dc..6f122b1 100644
--- a/tests/validation/reference/Permute.cpp
+++ b/tests/validation/reference/Permute.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Permute.h b/tests/validation/reference/Permute.h
index 3de22b6..895c08c 100644
--- a/tests/validation/reference/Permute.h
+++ b/tests/validation/reference/Permute.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Phase.cpp b/tests/validation/reference/Phase.cpp
index 7827cd2..228f73b 100644
--- a/tests/validation/reference/Phase.cpp
+++ b/tests/validation/reference/Phase.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Phase.h b/tests/validation/reference/Phase.h
index 93845fc..436c280 100644
--- a/tests/validation/reference/Phase.h
+++ b/tests/validation/reference/Phase.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/PixelWiseMultiplication.cpp b/tests/validation/reference/PixelWiseMultiplication.cpp
index 3e21fca..9f70b1c 100644
--- a/tests/validation/reference/PixelWiseMultiplication.cpp
+++ b/tests/validation/reference/PixelWiseMultiplication.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -178,6 +178,34 @@
 }
 
 template <>
+SimpleTensor<int16_t> pixel_wise_multiplication(const SimpleTensor<uint8_t> &src1, const SimpleTensor<uint8_t> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy,
+                                                DataType dt_out, const QuantizationInfo &qout)
+{
+    SimpleTensor<int16_t> dst(TensorShape::broadcast_shape(src1.shape(), src2.shape()), dt_out, 1, qout);
+
+    if(src1.data_type() == DataType::QASYMM8 && src2.data_type() == DataType::QASYMM8)
+    {
+        SimpleTensor<float> src1_tmp = convert_from_asymmetric(src1);
+        SimpleTensor<float> src2_tmp = convert_from_asymmetric(src2);
+        SimpleTensor<float> dst_tmp  = pixel_wise_multiplication<float, float, float>(src1_tmp, src2_tmp, scale, convert_policy, rounding_policy, DataType::F32, qout);
+        dst                          = convert_to_symmetric<int16_t>(dst_tmp, qout);
+    }
+    else
+    {
+        if(scale < 0)
+        {
+            ARM_COMPUTE_ERROR("Scale of pixel-wise multiplication must be non-negative");
+        }
+
+        Coordinates id_src1{};
+        Coordinates id_src2{};
+        Coordinates id_dst{};
+        BroadcastUnroll<Coordinates::num_max_dimensions>::unroll(src1, src2, dst, scale, convert_policy, rounding_policy, id_src1, id_src2, id_dst);
+    }
+    return dst;
+}
+
+template <>
 SimpleTensor<int8_t> pixel_wise_multiplication(const SimpleTensor<int8_t> &src1, const SimpleTensor<int8_t> &src2, float scale, ConvertPolicy convert_policy, RoundingPolicy rounding_policy,
                                                DataType dt_out, const QuantizationInfo &qout)
 {
diff --git a/tests/validation/reference/PixelWiseMultiplication.h b/tests/validation/reference/PixelWiseMultiplication.h
index f8afa03..d61e6c7 100644
--- a/tests/validation/reference/PixelWiseMultiplication.h
+++ b/tests/validation/reference/PixelWiseMultiplication.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/PoolingLayer.cpp b/tests/validation/reference/PoolingLayer.cpp
index 778e28d..5f4edfe 100644
--- a/tests/validation/reference/PoolingLayer.cpp
+++ b/tests/validation/reference/PoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,9 +43,10 @@
     ARM_COMPUTE_ERROR_ON(info.is_global_pooling && (src.shape().x() != src.shape().y()));
     // Create reference
     SimpleTensor<T> dst{ compute_pool_shape(TensorInfo(src.shape(), 1, src.data_type()), info), src.data_type(), 1 };
+    auto            pooled_shape = compute_pool_shape(TensorInfo(src.shape(), 1, src.data_type()), info);
     if(indices)
     {
-        *indices = SimpleTensor<uint32_t> { compute_pool_shape(TensorInfo(src.shape(), 1, src.data_type()), info), DataType::U32, 1 };
+        *indices = SimpleTensor<uint32_t> { pooled_shape, DataType::U32, 1 };
     }
     const int   pool_size_x     = info.is_global_pooling ? src.shape().x() : info.pool_size.width;
     const int   pool_size_y     = info.is_global_pooling ? src.shape().y() : info.pool_size.height;
@@ -58,56 +59,62 @@
     int         pad_bottom      = info.pad_stride_info.pad_bottom();
     bool        exclude_padding = info.exclude_padding;
 
-    const auto w_src      = static_cast<int>(src.shape()[0]);
-    const auto h_src      = static_cast<int>(src.shape()[1]);
-    const int  upper_dims = src.shape().total_size() / (w_src * h_src);
+    const auto w_src = static_cast<int>(src.shape()[0]);
+    const auto h_src = static_cast<int>(src.shape()[1]);
+    const auto z_src = static_cast<int>(src.shape()[2]);
+    const auto b_src = static_cast<int>(src.shape()[3]);
 
-    const auto  w_dst = static_cast<int>(dst.shape()[0]);
-    const auto  h_dst = static_cast<int>(dst.shape()[1]);
+    const int upper_dims = src.shape().total_size() / (w_src * h_src);
+
+    const auto w_dst = static_cast<int>(dst.shape()[0]);
+    const auto h_dst = static_cast<int>(dst.shape()[1]);
+    const auto z_dst = static_cast<int>(dst.shape()[2]);
+
     TensorShape shape_nhwc(src.shape());
     permute(shape_nhwc, PermutationVector(2U, 0U, 1U));
-
     if(type == PoolingType::MAX)
     {
-        for(int r = 0; r < upper_dims; ++r)
+        for(int b = 0; b < b_src; ++b)
         {
-            for(int h = 0; h < h_dst; ++h)
+            for(int r = 0; r < z_src; ++r)
             {
-                for(int w = 0; w < w_dst; ++w)
+                for(int h = 0; h < h_dst; ++h)
                 {
-                    int wstart = w * pool_stride_x - pad_left;
-                    int hstart = h * pool_stride_y - pad_top;
-                    int wend   = std::min(wstart + pool_size_x, w_src);
-                    int hend   = std::min(hstart + pool_size_y, h_src);
-                    wstart     = std::max(wstart, 0);
-                    hstart     = std::max(hstart, 0);
-
-                    auto max_val = std::numeric_limits<ACC_T>::lowest();
-                    int  max_index{ 0 };
-                    for(int y = hstart; y < hend; ++y)
+                    for(int w = 0; w < w_dst; ++w)
                     {
-                        for(int x = wstart; x < wend; ++x)
+                        int wstart   = w * pool_stride_x - pad_left;
+                        int hstart   = h * pool_stride_y - pad_top;
+                        int wend     = std::min(wstart + pool_size_x, w_src);
+                        int hend     = std::min(hstart + pool_size_y, h_src);
+                        wstart       = std::max(wstart, 0);
+                        hstart       = std::max(hstart, 0);
+                        auto max_val = std::numeric_limits<ACC_T>::lowest();
+                        int  max_index{ 0 };
+                        for(int y = hstart; y < hend; ++y)
                         {
-                            const auto val = static_cast<ACC_T>(src[r * h_src * w_src + y * w_src + x]);
-                            if(val > max_val)
+                            for(int x = wstart; x < wend; ++x)
                             {
-                                max_val = val;
-                                if(data_layout == DataLayout::NCHW)
+                                const auto val = static_cast<ACC_T>(src[b * z_src * h_src * w_src + r * h_src * w_src + y * w_src + x]);
+                                if(val > max_val)
                                 {
-                                    max_index = coord2index(src.shape(), Coordinates(x, y, r));
-                                }
-                                else
-                                {
-                                    max_index = coord2index(shape_nhwc, Coordinates(r, x, y));
+                                    max_val = val;
+                                    if(data_layout == DataLayout::NCHW)
+                                    {
+                                        max_index = coord2index(src.shape(), Coordinates(x, y, r, 0));
+                                    }
+                                    else
+                                    {
+                                        max_index = coord2index(shape_nhwc, Coordinates(r, x, y, 0));
+                                    }
                                 }
                             }
                         }
-                    }
 
-                    dst[r * h_dst * w_dst + h * w_dst + w] = static_cast<T>(max_val);
-                    if(indices)
-                    {
-                        (*indices)[r * h_dst * w_dst + h * w_dst + w] = max_index;
+                        dst[b * z_dst * h_dst * w_dst + r * h_dst * w_dst + h * w_dst + w] = static_cast<T>(max_val);
+                        if(indices)
+                        {
+                            (*indices)[b * z_dst * h_dst * w_dst + r * h_dst * w_dst + h * w_dst + w] = max_index;
+                        }
                     }
                 }
             }
@@ -164,7 +171,6 @@
             }
         }
     }
-
     return dst;
 }
 
@@ -209,7 +215,7 @@
         return pooling_layer_internal<half, float>(src, info, indices, data_layout);
     }
 
-    return pooling_layer_internal<half>(src, info, indices);
+    return pooling_layer_internal<half>(src, info, indices, data_layout);
 }
 
 template SimpleTensor<float> pooling_layer(const SimpleTensor<float> &src, const PoolingLayerInfo &info, const QuantizationInfo &output_qinfo, SimpleTensor<uint32_t> *indices, DataLayout data_layout);
diff --git a/tests/validation/reference/PoolingLayer.h b/tests/validation/reference/PoolingLayer.h
index 346f1c0..6ce8c84 100644
--- a/tests/validation/reference/PoolingLayer.h
+++ b/tests/validation/reference/PoolingLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/PriorBoxLayer.cpp b/tests/validation/reference/PriorBoxLayer.cpp
index 0fd4a8a..4745df8 100644
--- a/tests/validation/reference/PriorBoxLayer.cpp
+++ b/tests/validation/reference/PriorBoxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/PriorBoxLayer.h b/tests/validation/reference/PriorBoxLayer.h
index ee9ad05..eea3a3a 100644
--- a/tests/validation/reference/PriorBoxLayer.h
+++ b/tests/validation/reference/PriorBoxLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/QLSTMLayerNormalization.cpp b/tests/validation/reference/QLSTMLayerNormalization.cpp
index dd6517f..44d4f03 100644
--- a/tests/validation/reference/QLSTMLayerNormalization.cpp
+++ b/tests/validation/reference/QLSTMLayerNormalization.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/QLSTMLayerNormalization.h b/tests/validation/reference/QLSTMLayerNormalization.h
index c35aa2a..f051493 100644
--- a/tests/validation/reference/QLSTMLayerNormalization.h
+++ b/tests/validation/reference/QLSTMLayerNormalization.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/QuantizationLayer.cpp b/tests/validation/reference/QuantizationLayer.cpp
index a70523d..2766537 100644
--- a/tests/validation/reference/QuantizationLayer.cpp
+++ b/tests/validation/reference/QuantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/QuantizationLayer.h b/tests/validation/reference/QuantizationLayer.h
index 08501b7..a8f1ac2 100644
--- a/tests/validation/reference/QuantizationLayer.h
+++ b/tests/validation/reference/QuantizationLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ROIAlignLayer.cpp b/tests/validation/reference/ROIAlignLayer.cpp
index 8b2cd21..b75415c 100644
--- a/tests/validation/reference/ROIAlignLayer.cpp
+++ b/tests/validation/reference/ROIAlignLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ROIAlignLayer.h b/tests/validation/reference/ROIAlignLayer.h
index f3fb516..bb4a49a 100644
--- a/tests/validation/reference/ROIAlignLayer.h
+++ b/tests/validation/reference/ROIAlignLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Range.cpp b/tests/validation/reference/Range.cpp
index ad13454..7d7dcbf 100644
--- a/tests/validation/reference/Range.cpp
+++ b/tests/validation/reference/Range.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Range.h b/tests/validation/reference/Range.h
index d0f63c0..e361a32 100644
--- a/tests/validation/reference/Range.h
+++ b/tests/validation/reference/Range.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ReductionOperation.cpp b/tests/validation/reference/ReductionOperation.cpp
index 68352cc..5bdd4f7 100644
--- a/tests/validation/reference/ReductionOperation.cpp
+++ b/tests/validation/reference/ReductionOperation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ReductionOperation.h b/tests/validation/reference/ReductionOperation.h
index d7c77a6..56d37e4 100644
--- a/tests/validation/reference/ReductionOperation.h
+++ b/tests/validation/reference/ReductionOperation.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Remap.cpp b/tests/validation/reference/Remap.cpp
index a7352eb..5c4d3c1 100644
--- a/tests/validation/reference/Remap.cpp
+++ b/tests/validation/reference/Remap.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Remap.h b/tests/validation/reference/Remap.h
index f24a16b..0726f75 100644
--- a/tests/validation/reference/Remap.h
+++ b/tests/validation/reference/Remap.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ReorgLayer.cpp b/tests/validation/reference/ReorgLayer.cpp
index 9f087d0..0508ebe 100644
--- a/tests/validation/reference/ReorgLayer.cpp
+++ b/tests/validation/reference/ReorgLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ReorgLayer.h b/tests/validation/reference/ReorgLayer.h
index 47db7f3..25b5556 100644
--- a/tests/validation/reference/ReorgLayer.h
+++ b/tests/validation/reference/ReorgLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ReshapeLayer.cpp b/tests/validation/reference/ReshapeLayer.cpp
index 85bf3fc..daea001 100644
--- a/tests/validation/reference/ReshapeLayer.cpp
+++ b/tests/validation/reference/ReshapeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/ReshapeLayer.h b/tests/validation/reference/ReshapeLayer.h
index a68bbb3..26e357f 100644
--- a/tests/validation/reference/ReshapeLayer.h
+++ b/tests/validation/reference/ReshapeLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Reverse.cpp b/tests/validation/reference/Reverse.cpp
index f5630b9..c6c4614 100644
--- a/tests/validation/reference/Reverse.cpp
+++ b/tests/validation/reference/Reverse.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Reverse.h b/tests/validation/reference/Reverse.h
index cc46807..4a28da7 100644
--- a/tests/validation/reference/Reverse.h
+++ b/tests/validation/reference/Reverse.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Scale.cpp b/tests/validation/reference/Scale.cpp
index 73fe21c..aa265c2 100644
--- a/tests/validation/reference/Scale.cpp
+++ b/tests/validation/reference/Scale.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,7 +25,9 @@
 #include "Scale.h"
 
 #include "Utils.h"
+#include "arm_compute/core/utils/misc/Rounding.h"
 #include "arm_compute/core/utils/misc/Utility.h"
+#include "src/core/utils/ScaleUtils.h"
 
 namespace arm_compute
 {
@@ -42,17 +44,13 @@
     // Add 1 if ceil_policy_scale is true
     const size_t round_value = ceil_policy_scale ? 1U : 0U;
     TensorShape  shape_scaled(in.shape());
-    shape_scaled.set(0, (in.shape()[0] + round_value) * scale_x);
-    shape_scaled.set(1, (in.shape()[1] + round_value) * scale_y);
+    shape_scaled.set(0, (in.shape()[0] + round_value) * scale_x, /* apply_dim_correction = */ false);
+    shape_scaled.set(1, (in.shape()[1] + round_value) * scale_y, /* apply_dim_correction = */ false);
     SimpleTensor<T> out(shape_scaled, in.data_type());
 
-    const auto needs_align_corners = policy == InterpolationPolicy::BILINEAR
-                                     && sampling_policy == SamplingPolicy::TOP_LEFT
-                                     && align_corners;
-
     // Compute the ratio between source width/height and destination width/height
-    const auto wr = arm_compute::calculate_resize_ratio(in.shape()[0], out.shape()[0], needs_align_corners);
-    const auto hr = arm_compute::calculate_resize_ratio(in.shape()[1], out.shape()[1], needs_align_corners);
+    const auto wr = arm_compute::scale_utils::calculate_resize_ratio(in.shape()[0], out.shape()[0], align_corners);
+    const auto hr = arm_compute::scale_utils::calculate_resize_ratio(in.shape()[1], out.shape()[1], align_corners);
 
     const auto width  = static_cast<int>(in.shape().x());
     const auto height = static_cast<int>(in.shape().y());
@@ -82,8 +80,8 @@
                 switch(sampling_policy)
                 {
                     case SamplingPolicy::TOP_LEFT:
-                        x_src = std::floor(idx * wr);
-                        y_src = std::floor(idy * hr);
+                        x_src = align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(idx * wr) : std::floor(idx * wr);
+                        y_src = align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(idy * hr) : std::floor(idy * hr);
                         break;
                     case SamplingPolicy::CENTER:
                         //Calculate the source coords without -0.5f is equivalent to round the x_scr/y_src coords
diff --git a/tests/validation/reference/Scale.h b/tests/validation/reference/Scale.h
index 65cecbb..c66af8d 100644
--- a/tests/validation/reference/Scale.h
+++ b/tests/validation/reference/Scale.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Scharr.cpp b/tests/validation/reference/Scharr.cpp
index 060192b..e9fbb73 100644
--- a/tests/validation/reference/Scharr.cpp
+++ b/tests/validation/reference/Scharr.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Scharr.h b/tests/validation/reference/Scharr.h
index 4a41333..42b3202 100644
--- a/tests/validation/reference/Scharr.h
+++ b/tests/validation/reference/Scharr.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Select.cpp b/tests/validation/reference/Select.cpp
index 91b6b66..f3f9b41 100644
--- a/tests/validation/reference/Select.cpp
+++ b/tests/validation/reference/Select.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Select.h b/tests/validation/reference/Select.h
index e00d5cc..d23ebda 100644
--- a/tests/validation/reference/Select.h
+++ b/tests/validation/reference/Select.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/SliceOperations.cpp b/tests/validation/reference/SliceOperations.cpp
index b34afdc..50c5c68 100644
--- a/tests/validation/reference/SliceOperations.cpp
+++ b/tests/validation/reference/SliceOperations.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/SliceOperations.h b/tests/validation/reference/SliceOperations.h
index ac3b289..0a89325 100644
--- a/tests/validation/reference/SliceOperations.h
+++ b/tests/validation/reference/SliceOperations.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Sobel.cpp b/tests/validation/reference/Sobel.cpp
index 1f35717..d9c2532 100644
--- a/tests/validation/reference/Sobel.cpp
+++ b/tests/validation/reference/Sobel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Sobel.h b/tests/validation/reference/Sobel.h
index 43f0131..86d6d0b 100644
--- a/tests/validation/reference/Sobel.h
+++ b/tests/validation/reference/Sobel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/SoftmaxLayer.cpp b/tests/validation/reference/SoftmaxLayer.cpp
index 2fe1fae..0020676 100644
--- a/tests/validation/reference/SoftmaxLayer.cpp
+++ b/tests/validation/reference/SoftmaxLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,6 +23,7 @@
  */
 #include "SoftmaxLayer.h"
 
+#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Types.h"
 
 namespace arm_compute
@@ -34,32 +35,21 @@
 namespace reference
 {
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
-SimpleTensor<T> softmax_layer_generic(const SimpleTensor<T> &src, float beta, int32_t axis, bool is_log)
+SimpleTensor<T> softmax_layer_generic(const SimpleTensor<T> &src, float beta, int32_t reduce_end_axis, bool is_log)
 {
     // Create reference
     SimpleTensor<T> dst{ src.shape(), src.data_type(), 1 };
 
-    // Negative index is used to specify axis from the end (e.g. -1 for the last axis).
-    if(axis < 0)
-    {
-        axis += src.shape().num_dimensions();
-    }
+    // Convert reduce-before axis (inclusive) to first n axes to reduce
+    const size_t first_n_reduce_axes = dim_index_2_num_dims(reduce_end_axis, static_cast<int32_t>(src.shape().num_dimensions()));
 
     // Compute reference. Lower dims are the collapsing of the first axis
     // dimensions (i.e., the flattened dimension of each batch). The upper dims are
     // instead the batches we want to normalize
 
-    int lower_dims = 1;
-    for(size_t i = 0; i < static_cast<size_t>(axis); ++i)
-    {
-        lower_dims *= src.shape()[i];
-    }
+    const int lower_dims = src.shape().total_size_lower(first_n_reduce_axes);
 
-    int upper_dims = 1;
-    for(size_t i = static_cast<size_t>(axis); i < TensorShape::num_max_dimensions; ++i)
-    {
-        upper_dims *= src.shape()[i];
-    }
+    const int upper_dims = src.shape().total_size_upper(first_n_reduce_axes);
 
 #if defined(_OPENMP)
     #pragma omp parallel for
@@ -95,7 +85,7 @@
         {
             if(is_log)
             {
-                return val - sum;
+                return val - static_cast<T>(std::log(sum));
             }
             else
             {
@@ -107,30 +97,30 @@
     return dst;
 }
 
-template SimpleTensor<float> softmax_layer_generic(const SimpleTensor<float> &src, float beta, int32_t axis, bool is_log);
-template SimpleTensor<half> softmax_layer_generic(const SimpleTensor<half> &src, float beta, int32_t axis, bool is_log);
+template SimpleTensor<float> softmax_layer_generic(const SimpleTensor<float> &src, float beta, int32_t reduce_end_axis, bool is_log);
+template SimpleTensor<half> softmax_layer_generic(const SimpleTensor<half> &src, float beta, int32_t reduce_end_axis, bool is_log);
 
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
-SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, int32_t axis)
+SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, int32_t reduce_end_axis)
 {
-    return softmax_layer_generic<T>(src, beta, axis, false);
+    return softmax_layer_generic<T>(src, beta, reduce_end_axis, false);
 }
 
 template < typename T, typename std::enable_if < std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value, int >::type >
-SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, int32_t axis)
+SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, int32_t reduce_end_axis)
 {
     const QuantizationInfo output_quantization_info = arm_compute::get_softmax_output_quantization_info(src.data_type(), false);
 
     SimpleTensor<float> src_tmp = convert_from_asymmetric(src);
-    SimpleTensor<float> dst_tmp = softmax_layer<float>(src_tmp, beta, axis);
+    SimpleTensor<float> dst_tmp = softmax_layer<float>(src_tmp, beta, reduce_end_axis);
     SimpleTensor<T>     dst     = convert_to_asymmetric<T>(dst_tmp, output_quantization_info);
     return dst;
 }
 
-template SimpleTensor<float> softmax_layer(const SimpleTensor<float> &src, float beta, int32_t axis);
-template SimpleTensor<half> softmax_layer(const SimpleTensor<half> &src, float beta, int32_t axis);
-template SimpleTensor<uint8_t> softmax_layer(const SimpleTensor<uint8_t> &src, float beta, int32_t axis);
-template SimpleTensor<int8_t> softmax_layer(const SimpleTensor<int8_t> &src, float beta, int32_t axis);
+template SimpleTensor<float> softmax_layer(const SimpleTensor<float> &src, float beta, int32_t reduce_end_axis);
+template SimpleTensor<half> softmax_layer(const SimpleTensor<half> &src, float beta, int32_t reduce_end_axis);
+template SimpleTensor<uint8_t> softmax_layer(const SimpleTensor<uint8_t> &src, float beta, int32_t reduce_end_axis);
+template SimpleTensor<int8_t> softmax_layer(const SimpleTensor<int8_t> &src, float beta, int32_t reduce_end_axis);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/SoftmaxLayer.h b/tests/validation/reference/SoftmaxLayer.h
index f819853..2af0b6d 100644
--- a/tests/validation/reference/SoftmaxLayer.h
+++ b/tests/validation/reference/SoftmaxLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,13 +36,13 @@
 namespace reference
 {
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
-SimpleTensor<T> softmax_layer_generic(const SimpleTensor<T> &src, float beta, int32_t axis, bool is_log = false);
+SimpleTensor<T> softmax_layer_generic(const SimpleTensor<T> &src, float beta, int32_t reduce_end_axis, bool is_log = false);
 
 template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
-SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, int32_t axis = -1);
+SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, int32_t reduce_end_axis = 0);
 
 template < typename T, typename std::enable_if < std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value, int >::type = 0 >
-SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, int32_t axis = -1);
+SimpleTensor<T> softmax_layer(const SimpleTensor<T> &src, float beta, int32_t reduce_end_axis = 0);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/SpaceToBatch.cpp b/tests/validation/reference/SpaceToBatch.cpp
index 8c25bb7..1d632a3 100644
--- a/tests/validation/reference/SpaceToBatch.cpp
+++ b/tests/validation/reference/SpaceToBatch.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/SpaceToBatch.h b/tests/validation/reference/SpaceToBatch.h
index c4e7797..809be80 100644
--- a/tests/validation/reference/SpaceToBatch.h
+++ b/tests/validation/reference/SpaceToBatch.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/SpaceToDepth.cpp b/tests/validation/reference/SpaceToDepth.cpp
index bd8e37a8b..da911ae 100644
--- a/tests/validation/reference/SpaceToDepth.cpp
+++ b/tests/validation/reference/SpaceToDepth.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/SpaceToDepth.h b/tests/validation/reference/SpaceToDepth.h
index faead46..e2ea1b7 100644
--- a/tests/validation/reference/SpaceToDepth.h
+++ b/tests/validation/reference/SpaceToDepth.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/StackLayer.cpp b/tests/validation/reference/StackLayer.cpp
index 9e9e434..5873101 100644
--- a/tests/validation/reference/StackLayer.cpp
+++ b/tests/validation/reference/StackLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/StackLayer.h b/tests/validation/reference/StackLayer.h
index 301d59a..6a7181e 100644
--- a/tests/validation/reference/StackLayer.h
+++ b/tests/validation/reference/StackLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/TableLookup.cpp b/tests/validation/reference/TableLookup.cpp
index 7f105d9..1cd14b5 100644
--- a/tests/validation/reference/TableLookup.cpp
+++ b/tests/validation/reference/TableLookup.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/TableLookup.h b/tests/validation/reference/TableLookup.h
index 3431343..d765a14 100644
--- a/tests/validation/reference/TableLookup.h
+++ b/tests/validation/reference/TableLookup.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Threshold.cpp b/tests/validation/reference/Threshold.cpp
index d0ef31d..6bc6cf0 100644
--- a/tests/validation/reference/Threshold.cpp
+++ b/tests/validation/reference/Threshold.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Threshold.h b/tests/validation/reference/Threshold.h
index b8b58ff..bee9531 100644
--- a/tests/validation/reference/Threshold.h
+++ b/tests/validation/reference/Threshold.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Tile.cpp b/tests/validation/reference/Tile.cpp
index 694f645..8b5a8d1 100644
--- a/tests/validation/reference/Tile.cpp
+++ b/tests/validation/reference/Tile.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Tile.h b/tests/validation/reference/Tile.h
index 36de966..d378369 100644
--- a/tests/validation/reference/Tile.h
+++ b/tests/validation/reference/Tile.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Transpose.cpp b/tests/validation/reference/Transpose.cpp
index a8c0e95..02c6d86 100644
--- a/tests/validation/reference/Transpose.cpp
+++ b/tests/validation/reference/Transpose.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Transpose.h b/tests/validation/reference/Transpose.h
index aedd494..bbe1c35 100644
--- a/tests/validation/reference/Transpose.h
+++ b/tests/validation/reference/Transpose.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Unstack.cpp b/tests/validation/reference/Unstack.cpp
index 3474c15..02a9c7a 100644
--- a/tests/validation/reference/Unstack.cpp
+++ b/tests/validation/reference/Unstack.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Unstack.h b/tests/validation/reference/Unstack.h
index a9e48d7..88f8a66 100644
--- a/tests/validation/reference/Unstack.h
+++ b/tests/validation/reference/Unstack.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/UpsampleLayer.cpp b/tests/validation/reference/UpsampleLayer.cpp
index a81a601..4e06ad4 100644
--- a/tests/validation/reference/UpsampleLayer.cpp
+++ b/tests/validation/reference/UpsampleLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/UpsampleLayer.h b/tests/validation/reference/UpsampleLayer.h
index 1aad44c..b1d8398 100644
--- a/tests/validation/reference/UpsampleLayer.h
+++ b/tests/validation/reference/UpsampleLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Utils.cpp b/tests/validation/reference/Utils.cpp
index 7a9bfee..844103f 100644
--- a/tests/validation/reference/Utils.cpp
+++ b/tests/validation/reference/Utils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Utils.h b/tests/validation/reference/Utils.h
index f7699b1..17ef235 100644
--- a/tests/validation/reference/Utils.h
+++ b/tests/validation/reference/Utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/UtilsQuantizedAsymm.h b/tests/validation/reference/UtilsQuantizedAsymm.h
index 444696c..91d0abf 100644
--- a/tests/validation/reference/UtilsQuantizedAsymm.h
+++ b/tests/validation/reference/UtilsQuantizedAsymm.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/WarpAffine.cpp b/tests/validation/reference/WarpAffine.cpp
index 2a7aeb7..3580b75 100644
--- a/tests/validation/reference/WarpAffine.cpp
+++ b/tests/validation/reference/WarpAffine.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/WarpAffine.h b/tests/validation/reference/WarpAffine.h
index 3d98003..90f765c 100644
--- a/tests/validation/reference/WarpAffine.h
+++ b/tests/validation/reference/WarpAffine.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/WarpPerspective.cpp b/tests/validation/reference/WarpPerspective.cpp
index dc7420b..e35d75e 100644
--- a/tests/validation/reference/WarpPerspective.cpp
+++ b/tests/validation/reference/WarpPerspective.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/WarpPerspective.h b/tests/validation/reference/WarpPerspective.h
index b919d62..7fcd5dd 100644
--- a/tests/validation/reference/WarpPerspective.h
+++ b/tests/validation/reference/WarpPerspective.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/WeightsReshape.cpp b/tests/validation/reference/WeightsReshape.cpp
index fc02395..bb6caea 100644
--- a/tests/validation/reference/WeightsReshape.cpp
+++ b/tests/validation/reference/WeightsReshape.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/WeightsReshape.h b/tests/validation/reference/WeightsReshape.h
index d7999b5..7a6ce4b 100644
--- a/tests/validation/reference/WeightsReshape.h
+++ b/tests/validation/reference/WeightsReshape.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Winograd.cpp b/tests/validation/reference/Winograd.cpp
index 61ba510..64cd9e6 100644
--- a/tests/validation/reference/Winograd.cpp
+++ b/tests/validation/reference/Winograd.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/Winograd.h b/tests/validation/reference/Winograd.h
index 99e7a9a..8e28aa3 100644
--- a/tests/validation/reference/Winograd.h
+++ b/tests/validation/reference/Winograd.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/YOLOLayer.cpp b/tests/validation/reference/YOLOLayer.cpp
index 92bbf54..fbc81f1 100644
--- a/tests/validation/reference/YOLOLayer.cpp
+++ b/tests/validation/reference/YOLOLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/tests/validation/reference/YOLOLayer.h b/tests/validation/reference/YOLOLayer.h
index 6347845..33cf630 100644
--- a/tests/validation/reference/YOLOLayer.h
+++ b/tests/validation/reference/YOLOLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/utils/CommonGraphOptions.cpp b/utils/CommonGraphOptions.cpp
index fa9106c..bcfb865 100644
--- a/utils/CommonGraphOptions.cpp
+++ b/utils/CommonGraphOptions.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/utils/CommonGraphOptions.h b/utils/CommonGraphOptions.h
index 3666462..ab7125e 100644
--- a/utils/CommonGraphOptions.h
+++ b/utils/CommonGraphOptions.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/utils/GraphUtils.cpp b/utils/GraphUtils.cpp
index 71bfc37..84f0416 100644
--- a/utils/GraphUtils.cpp
+++ b/utils/GraphUtils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/utils/GraphUtils.h b/utils/GraphUtils.h
index d6bae3e..9ab9e54 100644
--- a/utils/GraphUtils.h
+++ b/utils/GraphUtils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/utils/ImageLoader.h b/utils/ImageLoader.h
index 497320e..2dbb6f9 100644
--- a/utils/ImageLoader.h
+++ b/utils/ImageLoader.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h
index 55e34c8..372d4e7 100644
--- a/utils/TypePrinter.h
+++ b/utils/TypePrinter.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -2292,6 +2292,12 @@
         case CPUModel::A55r1:
             os << "A55r1";
             break;
+        case CPUModel::A73:
+            os << "A73";
+            break;
+        case CPUModel::X1:
+            os << "X1";
+            break;
         default:
             ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
     }
diff --git a/utils/Utils.cpp b/utils/Utils.cpp
index 70c2004..754e7d0 100644
--- a/utils/Utils.cpp
+++ b/utils/Utils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/utils/Utils.h b/utils/Utils.h
index 23846cf..c5db56d 100644
--- a/utils/Utils.h
+++ b/utils/Utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/utils/command_line/CommandLineOptions.h b/utils/command_line/CommandLineOptions.h
index 8f82815..99fa1ad 100644
--- a/utils/command_line/CommandLineOptions.h
+++ b/utils/command_line/CommandLineOptions.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/utils/command_line/CommandLineParser.h b/utils/command_line/CommandLineParser.h
index 1aa952d..5881723 100644
--- a/utils/command_line/CommandLineParser.h
+++ b/utils/command_line/CommandLineParser.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/utils/command_line/EnumListOption.h b/utils/command_line/EnumListOption.h
index 834becb..f4ee283 100644
--- a/utils/command_line/EnumListOption.h
+++ b/utils/command_line/EnumListOption.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/utils/command_line/EnumOption.h b/utils/command_line/EnumOption.h
index b775db2..6bcfe5f 100644
--- a/utils/command_line/EnumOption.h
+++ b/utils/command_line/EnumOption.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/utils/command_line/ListOption.h b/utils/command_line/ListOption.h
index 209a85d..b290191 100644
--- a/utils/command_line/ListOption.h
+++ b/utils/command_line/ListOption.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/utils/command_line/Option.h b/utils/command_line/Option.h
index b9469a5..c845e54 100644
--- a/utils/command_line/Option.h
+++ b/utils/command_line/Option.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/utils/command_line/SimpleOption.h b/utils/command_line/SimpleOption.h
index 5437592..d767973 100644
--- a/utils/command_line/SimpleOption.h
+++ b/utils/command_line/SimpleOption.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/utils/command_line/ToggleOption.h b/utils/command_line/ToggleOption.h
index b1d2a32..d3c6866 100644
--- a/utils/command_line/ToggleOption.h
+++ b/utils/command_line/ToggleOption.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2018 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *