arm_compute v20.05
diff --git a/Android.bp b/Android.bp
index e7ad651..59fb270 100644
--- a/Android.bp
+++ b/Android.bp
@@ -51,6 +51,7 @@
         "src/core/AccessWindowAutoPadding.cpp",
         "src/core/AccessWindowStatic.cpp",
         "src/core/AccessWindowTranspose.cpp",
+        "src/core/CL/CLCompileContext.cpp",
         "src/core/CL/CLCoreRuntimeContext.cpp",
         "src/core/CL/CLHelpers.cpp",
         "src/core/CL/CLKernelLibrary.cpp",
@@ -67,8 +68,12 @@
         "src/core/CL/OpenCL.cpp",
         "src/core/CL/gemm/CLGEMMHelpers.cpp",
         "src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.cpp",
+        "src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.cpp",
+        "src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.cpp",
         "src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.cpp",
+        "src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.cpp",
         "src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp",
+        "src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.cpp",
         "src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp",
         "src/core/CL/kernels/CLAccumulateKernel.cpp",
         "src/core/CL/kernels/CLActivationLayerKernel.cpp",
@@ -117,17 +122,16 @@
         "src/core/CL/kernels/CLFlattenLayerKernel.cpp",
         "src/core/CL/kernels/CLFloorKernel.cpp",
         "src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp",
-        "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp",
         "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp",
         "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp",
         "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp",
         "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp",
         "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp",
+        "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp",
+        "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp",
         "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp",
         "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp",
         "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp",
-        "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.cpp",
-        "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp",
         "src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp",
         "src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp",
         "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp",
@@ -169,6 +173,7 @@
         "src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp",
         "src/core/CL/kernels/CLPoolingLayerKernel.cpp",
         "src/core/CL/kernels/CLPriorBoxLayerKernel.cpp",
+        "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp",
         "src/core/CL/kernels/CLQuantizationLayerKernel.cpp",
         "src/core/CL/kernels/CLROIAlignLayerKernel.cpp",
         "src/core/CL/kernels/CLROIPoolingLayerKernel.cpp",
@@ -209,7 +214,6 @@
         "src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp",
         "src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp",
         "src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp",
-        "src/core/CPP/kernels/CPPFlipWeightsKernel.cpp",
         "src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp",
         "src/core/CPP/kernels/CPPPermuteKernel.cpp",
         "src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp",
@@ -278,10 +282,10 @@
         "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp",
         "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp",
         "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp",
+        "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp",
         "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp",
         "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp",
         "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp",
-        "src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp",
         "src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp",
         "src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp",
         "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp",
@@ -319,6 +323,7 @@
         "src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp",
         "src/core/NEON/kernels/NEPoolingLayerKernel.cpp",
         "src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp",
+        "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp",
         "src/core/NEON/kernels/NEQuantizationLayerKernel.cpp",
         "src/core/NEON/kernels/NEROIAlignLayerKernel.cpp",
         "src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp",
@@ -349,6 +354,7 @@
         "src/core/NEON/kernels/NEWidthConcatenateLayerKernel.cpp",
         "src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp",
         "src/core/NEON/kernels/NEYOLOLayerKernel.cpp",
+        "src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp",
         "src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp",
         "src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp",
         "src/core/NEON/kernels/arm_gemm/gemm_int16.cpp",
@@ -381,26 +387,32 @@
         "src/core/NEON/kernels/convolution/winograd/padding.cpp",
         "src/core/NEON/kernels/convolution/winograd/winograd.cpp",
         "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_1x8_fp32_fp32_integers.cpp",
+        "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp16_fp16_integers.cpp",
         "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_4x4_fp32_fp32_integers.cpp",
+        "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp16_fp16_integers.cpp",
         "src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_6x6_fp32_fp32_integers.cpp",
         "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2_7_fp32_fp32_integers.cpp",
         "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_3x3_fp32_fp32_integers.cpp",
         "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_2x2_5x5_fp32_fp32_integers.cpp",
         "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4_5_fp32_fp32_integers.cpp",
+        "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp16_fp16_integers.cpp",
         "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_4x4_3x3_fp32_fp32_integers.cpp",
         "src/core/NEON/kernels/convolution/winograd/winograd_transforms/output_6_3_fp32_fp32_integers.cpp",
         "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2_7_fp32_fp32_integers.cpp",
         "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_3x3_fp32_fp32_integers.cpp",
         "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_2x2_5x5_fp32_fp32_integers.cpp",
         "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4_5_fp32_fp32_integers.cpp",
+        "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp16_fp16_integers.cpp",
         "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_4x4_3x3_fp32_fp32_integers.cpp",
         "src/core/NEON/kernels/convolution/winograd/winograd_transforms/weights_6_3_fp32_fp32_integers.cpp",
         "src/core/PyramidInfo.cpp",
         "src/core/Rounding.cpp",
+        "src/core/Size2D.cpp",
         "src/core/SubTensorInfo.cpp",
         "src/core/TensorInfo.cpp",
         "src/core/Utils.cpp",
         "src/core/Validate.cpp",
+        "src/core/Version.cpp",
         "src/core/utils/helpers/fft.cpp",
         "src/core/utils/helpers/tensor_transform.cpp",
         "src/core/utils/io/FileHandler.cpp",
@@ -431,6 +443,7 @@
         "src/runtime/CL/CLTensorAllocator.cpp",
         "src/runtime/CL/CLTuner.cpp",
         "src/runtime/CL/ICLSimpleFunction.cpp",
+        "src/runtime/CL/Utils.cpp",
         "src/runtime/CL/functions/CLAbsoluteDifference.cpp",
         "src/runtime/CL/functions/CLAccumulate.cpp",
         "src/runtime/CL/functions/CLActivationLayer.cpp",
@@ -522,6 +535,7 @@
         "src/runtime/CL/functions/CLPixelWiseMultiplication.cpp",
         "src/runtime/CL/functions/CLPoolingLayer.cpp",
         "src/runtime/CL/functions/CLPriorBoxLayer.cpp",
+        "src/runtime/CL/functions/CLQLSTMLayer.cpp",
         "src/runtime/CL/functions/CLQuantizationLayer.cpp",
         "src/runtime/CL/functions/CLRNNLayer.cpp",
         "src/runtime/CL/functions/CLROIAlignLayer.cpp",
@@ -557,6 +571,9 @@
         "src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp",
         "src/runtime/CL/functions/CLWinogradInputTransform.cpp",
         "src/runtime/CL/functions/CLYOLOLayer.cpp",
+        "src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp",
+        "src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp",
+        "src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp",
         "src/runtime/CL/tuners/BifrostTuner.cpp",
         "src/runtime/CL/tuners/CLLWSList.cpp",
         "src/runtime/CL/tuners/MidgardTuner.cpp",
@@ -683,6 +700,7 @@
         "src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp",
         "src/runtime/NEON/functions/NEPoolingLayer.cpp",
         "src/runtime/NEON/functions/NEPriorBoxLayer.cpp",
+        "src/runtime/NEON/functions/NEQLSTMLayer.cpp",
         "src/runtime/NEON/functions/NEQuantizationLayer.cpp",
         "src/runtime/NEON/functions/NERNNLayer.cpp",
         "src/runtime/NEON/functions/NEROIAlignLayer.cpp",
@@ -764,6 +782,8 @@
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_native_fp32_mla_16x4/generic.cpp",
@@ -783,20 +803,28 @@
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/a55.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_native_bf16fp32_dot_4VLx4/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp16_mla_4VLx4/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_native_fp32_mla_4VLx4/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_native_s8s32_dot_4VLx4/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_native_u8u32_dot_4VLx4/generic.cpp",
+                "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8/generic.cpp",
                 "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8/generic.cpp",