AArch64/ARM64: move ARM64 into AArch64's place

This commit starts with a "git mv ARM64 AArch64" and continues
from there, renaming the C++ classes, intrinsics, and other
target-local objects for consistency.

"ARM64" test directories are also moved, and tests that began their
life in ARM64 use an arm64 triple, those from AArch64 use an aarch64
triple. Both should be equivalent though.
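
For instance, either of these RUN lines should now exercise the same
merged backend (a hypothetical test header mirroring the hunks below,
not part of this patch):

  ; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-linux-gnu | FileCheck %s
  ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-linux-gnu | FileCheck %s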

This finishes the AArch64 merge, and everyone should feel free to
continue committing as normal now.
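
As a sketch of the mechanical rename applied throughout, a minimal
post-merge test now reads as follows; before the merge the same test
would have spelled the intrinsic @llvm.arm64.neon.sqadd.i32 and the
flag -arm64-neon-syntax (the test itself is illustrative, mirroring
the hunks below):

  ; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s

  declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)

  define i32 @example(i32 %a, i32 %b) {
  ; CHECK-LABEL: example:
  ; CHECK: sqadd
    %r = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 %b)
    ret i32 %r
  }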

llvm-svn: 209577
diff --git a/llvm/test/CodeGen/AArch64/128bit_load_store.ll b/llvm/test/CodeGen/AArch64/128bit_load_store.ll
index 56f6787..a6f0776 100644
--- a/llvm/test/CodeGen/AArch64/128bit_load_store.ll
+++ b/llvm/test/CodeGen/AArch64/128bit_load_store.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=neon | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon | FileCheck %s --check-prefix=CHECK
 
 define void @test_store_f128(fp128* %ptr, fp128 %val) #0 {
 ; CHECK-LABEL: test_store_f128
@@ -17,8 +17,8 @@
 }
 
 define void @test_vstrq_p128(i128* %ptr, i128 %val) #0 {
-; CHECK-ARM64-LABEL: test_vstrq_p128
-; CHECK-ARM64: stp {{x[0-9]+}}, {{x[0-9]+}}, [{{x[0-9]+}}]
+; CHECK-LABEL: test_vstrq_p128
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [{{x[0-9]+}}]
 
 entry:
   %0 = bitcast i128* %ptr to fp128*
@@ -28,8 +28,8 @@
 }
 
 define i128 @test_vldrq_p128(i128* readonly %ptr) #2 {
-; CHECK-ARM64-LABEL: test_vldrq_p128
-; CHECK-ARM64: ldp {{x[0-9]+}}, {{x[0-9]+}}, [{{x[0-9]+}}]
+; CHECK-LABEL: test_vldrq_p128
+; CHECK: ldp {{x[0-9]+}}, {{x[0-9]+}}, [{{x[0-9]+}}]
 
 entry:
   %0 = bitcast i128* %ptr to fp128*
diff --git a/llvm/test/CodeGen/ARM64/aarch64-neon-v1i1-setcc.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/aarch64-neon-v1i1-setcc.ll
rename to llvm/test/CodeGen/AArch64/aarch64-neon-v1i1-setcc.ll
diff --git a/llvm/test/CodeGen/AArch64/addsub.ll b/llvm/test/CodeGen/AArch64/addsub.ll
index 3aa427c..b85fdbb 100644
--- a/llvm/test/CodeGen/AArch64/addsub.ll
+++ b/llvm/test/CodeGen/AArch64/addsub.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-linux-gnu | FileCheck %s
 
 ; Note that this should be refactored (for efficiency if nothing else)
 ; when the PCS is implemented so we don't have to worry about the
diff --git a/llvm/test/CodeGen/AArch64/addsub_ext.ll b/llvm/test/CodeGen/AArch64/addsub_ext.ll
index cd01f59..a2266b1 100644
--- a/llvm/test/CodeGen/AArch64/addsub_ext.ll
+++ b/llvm/test/CodeGen/AArch64/addsub_ext.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs %s -o - -mtriple=arm64-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs %s -o - -mtriple=aarch64-linux-gnu | FileCheck %s
 
 @var8 = global i8 0
 @var16 = global i16 0
diff --git a/llvm/test/CodeGen/AArch64/alloca.ll b/llvm/test/CodeGen/AArch64/alloca.ll
index 7cab200..f93efbc 100644
--- a/llvm/test/CodeGen/AArch64/alloca.ll
+++ b/llvm/test/CodeGen/AArch64/alloca.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=arm64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64
-; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=-fp-armv8 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-NOFP-ARM64 %s
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-NOFP-ARM64 %s
 
 declare void @use_addr(i8*)
 
@@ -53,7 +53,7 @@
 
   %val = load i64* %loc
 
-; CHECK-ARM64: ldur x0, [x29, #-[[LOC_FROM_FP]]]
+; CHECK: ldur x0, [x29, #-[[LOC_FROM_FP]]]
 
   ret i64 %val
   ; Make sure epilogue restores sp from fp
@@ -74,16 +74,16 @@
 ; CHECK-NOFP-AARCH64: add     x8, [[TMP]], #0
 
 
-; CHECK-ARM64: stp     x29, x30, [sp, #-16]!
-; CHECK-ARM64: mov     x29, sp
-; CHECK-ARM64: sub     sp, sp, #192
-; CHECK-ARM64: stp     q6, q7, [x29, #-96]
+; CHECK: stp     x29, x30, [sp, #-16]!
+; CHECK: mov     x29, sp
+; CHECK: sub     sp, sp, #192
+; CHECK: stp     q6, q7, [x29, #-96]
 ; [...]
-; CHECK-ARM64: stp     q0, q1, [x29, #-192]
+; CHECK: stp     q0, q1, [x29, #-192]
 
-; CHECK-ARM64: stp     x6, x7, [x29, #-16]
+; CHECK: stp     x6, x7, [x29, #-16]
 ; [...]
-; CHECK-ARM64: stp     x2, x3, [x29, #-48]
+; CHECK: stp     x2, x3, [x29, #-48]
 
 ; CHECK-NOFP-ARM64: stp     x29, x30, [sp, #-16]!
 ; CHECK-NOFP-ARM64: mov     x29, sp
@@ -115,11 +115,11 @@
 ; CHECK-LABEL: test_alloca_large_frame:
 
 
-; CHECK-ARM64: stp     x20, x19, [sp, #-32]!
-; CHECK-ARM64: stp     x29, x30, [sp, #16]
-; CHECK-ARM64: add     x29, sp, #16
-; CHECK-ARM64: sub     sp, sp, #1953, lsl #12
-; CHECK-ARM64: sub     sp, sp, #512
+; CHECK: stp     x20, x19, [sp, #-32]!
+; CHECK: stp     x29, x30, [sp, #16]
+; CHECK: add     x29, sp, #16
+; CHECK: sub     sp, sp, #1953, lsl #12
+; CHECK: sub     sp, sp, #512
 
   %addr1 = alloca i8, i64 %n
   %addr2 = alloca i64, i64 1000000
@@ -128,9 +128,9 @@
 
   ret void
 
-; CHECK-ARM64: sub     sp, x29, #16
-; CHECK-ARM64: ldp     x29, x30, [sp, #16]
-; CHECK-ARM64: ldp     x20, x19, [sp], #32
+; CHECK: sub     sp, x29, #16
+; CHECK: ldp     x29, x30, [sp, #16]
+; CHECK: ldp     x20, x19, [sp], #32
 }
 
 declare i8* @llvm.stacksave()
diff --git a/llvm/test/CodeGen/AArch64/analyze-branch.ll b/llvm/test/CodeGen/AArch64/analyze-branch.ll
index 1d4daec..6616b27 100644
--- a/llvm/test/CodeGen/AArch64/analyze-branch.ll
+++ b/llvm/test/CodeGen/AArch64/analyze-branch.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-none-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
 
 ; This test checks that LLVM can do basic stripping and reapplying of branches
 ; to basic blocks.
diff --git a/llvm/test/CodeGen/ARM64/2011-03-09-CPSRSpill.ll b/llvm/test/CodeGen/AArch64/arm64-2011-03-09-CPSRSpill.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/2011-03-09-CPSRSpill.ll
rename to llvm/test/CodeGen/AArch64/arm64-2011-03-09-CPSRSpill.ll
diff --git a/llvm/test/CodeGen/ARM64/2011-03-17-AsmPrinterCrash.ll b/llvm/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/2011-03-17-AsmPrinterCrash.ll
rename to llvm/test/CodeGen/AArch64/arm64-2011-03-17-AsmPrinterCrash.ll
diff --git a/llvm/test/CodeGen/ARM64/2011-03-21-Unaligned-Frame-Index.ll b/llvm/test/CodeGen/AArch64/arm64-2011-03-21-Unaligned-Frame-Index.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/2011-03-21-Unaligned-Frame-Index.ll
rename to llvm/test/CodeGen/AArch64/arm64-2011-03-21-Unaligned-Frame-Index.ll
diff --git a/llvm/test/CodeGen/ARM64/2011-04-21-CPSRBug.ll b/llvm/test/CodeGen/AArch64/arm64-2011-04-21-CPSRBug.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/2011-04-21-CPSRBug.ll
rename to llvm/test/CodeGen/AArch64/arm64-2011-04-21-CPSRBug.ll
diff --git a/llvm/test/CodeGen/ARM64/2011-10-18-LdStOptBug.ll b/llvm/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/2011-10-18-LdStOptBug.ll
rename to llvm/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll
diff --git a/llvm/test/CodeGen/ARM64/2012-01-11-ComparisonDAGCrash.ll b/llvm/test/CodeGen/AArch64/arm64-2012-01-11-ComparisonDAGCrash.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/2012-01-11-ComparisonDAGCrash.ll
rename to llvm/test/CodeGen/AArch64/arm64-2012-01-11-ComparisonDAGCrash.ll
diff --git a/llvm/test/CodeGen/ARM64/2012-05-07-DAGCombineVectorExtract.ll b/llvm/test/CodeGen/AArch64/arm64-2012-05-07-DAGCombineVectorExtract.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/2012-05-07-DAGCombineVectorExtract.ll
rename to llvm/test/CodeGen/AArch64/arm64-2012-05-07-DAGCombineVectorExtract.ll
diff --git a/llvm/test/CodeGen/ARM64/2012-05-07-MemcpyAlignBug.ll b/llvm/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/2012-05-07-MemcpyAlignBug.ll
rename to llvm/test/CodeGen/AArch64/arm64-2012-05-07-MemcpyAlignBug.ll
diff --git a/llvm/test/CodeGen/ARM64/2012-05-09-LOADgot-bug.ll b/llvm/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/2012-05-09-LOADgot-bug.ll
rename to llvm/test/CodeGen/AArch64/arm64-2012-05-09-LOADgot-bug.ll
diff --git a/llvm/test/CodeGen/ARM64/2012-05-22-LdStOptBug.ll b/llvm/test/CodeGen/AArch64/arm64-2012-05-22-LdStOptBug.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/2012-05-22-LdStOptBug.ll
rename to llvm/test/CodeGen/AArch64/arm64-2012-05-22-LdStOptBug.ll
diff --git a/llvm/test/CodeGen/ARM64/2012-06-06-FPToUI.ll b/llvm/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/2012-06-06-FPToUI.ll
rename to llvm/test/CodeGen/AArch64/arm64-2012-06-06-FPToUI.ll
diff --git a/llvm/test/CodeGen/ARM64/2012-07-11-InstrEmitterBug.ll b/llvm/test/CodeGen/AArch64/arm64-2012-07-11-InstrEmitterBug.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/2012-07-11-InstrEmitterBug.ll
rename to llvm/test/CodeGen/AArch64/arm64-2012-07-11-InstrEmitterBug.ll
diff --git a/llvm/test/CodeGen/ARM64/2013-01-13-ffast-fcmp.ll b/llvm/test/CodeGen/AArch64/arm64-2013-01-13-ffast-fcmp.ll
similarity index 71%
rename from llvm/test/CodeGen/ARM64/2013-01-13-ffast-fcmp.ll
rename to llvm/test/CodeGen/AArch64/arm64-2013-01-13-ffast-fcmp.ll
index b40a581..e2c43d9 100644
--- a/llvm/test/CodeGen/ARM64/2013-01-13-ffast-fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-2013-01-13-ffast-fcmp.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -fp-contract=fast | FileCheck %s --check-prefix=FAST
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -fp-contract=fast | FileCheck %s --check-prefix=FAST
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
 target triple = "arm64-apple-ios7.0.0"
diff --git a/llvm/test/CodeGen/ARM64/2013-01-23-frem-crash.ll b/llvm/test/CodeGen/AArch64/arm64-2013-01-23-frem-crash.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/2013-01-23-frem-crash.ll
rename to llvm/test/CodeGen/AArch64/arm64-2013-01-23-frem-crash.ll
diff --git a/llvm/test/CodeGen/ARM64/2013-01-23-sext-crash.ll b/llvm/test/CodeGen/AArch64/arm64-2013-01-23-sext-crash.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/2013-01-23-sext-crash.ll
rename to llvm/test/CodeGen/AArch64/arm64-2013-01-23-sext-crash.ll
diff --git a/llvm/test/CodeGen/ARM64/2013-02-12-shufv8i8.ll b/llvm/test/CodeGen/AArch64/arm64-2013-02-12-shufv8i8.ll
similarity index 82%
rename from llvm/test/CodeGen/ARM64/2013-02-12-shufv8i8.ll
rename to llvm/test/CodeGen/AArch64/arm64-2013-02-12-shufv8i8.ll
index 70e745f..a350ba1 100644
--- a/llvm/test/CodeGen/ARM64/2013-02-12-shufv8i8.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-2013-02-12-shufv8i8.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple
 
 ;CHECK-LABEL: Shuff:
 ;CHECK: tbl.8b
diff --git a/llvm/test/CodeGen/ARM64/2014-04-16-AnInfiniteLoopInDAGCombine.ll b/llvm/test/CodeGen/AArch64/arm64-2014-04-16-AnInfiniteLoopInDAGCombine.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/2014-04-16-AnInfiniteLoopInDAGCombine.ll
rename to llvm/test/CodeGen/AArch64/arm64-2014-04-16-AnInfiniteLoopInDAGCombine.ll
diff --git a/llvm/test/CodeGen/ARM64/2014-04-28-sqshl-uqshl-i64Contant.ll b/llvm/test/CodeGen/AArch64/arm64-2014-04-28-sqshl-uqshl-i64Contant.ll
similarity index 63%
rename from llvm/test/CodeGen/ARM64/2014-04-28-sqshl-uqshl-i64Contant.ll
rename to llvm/test/CodeGen/AArch64/arm64-2014-04-28-sqshl-uqshl-i64Contant.ll
index a4c9cd8..3949b85 100644
--- a/llvm/test/CodeGen/ARM64/2014-04-28-sqshl-uqshl-i64Contant.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-2014-04-28-sqshl-uqshl-i64Contant.ll
@@ -4,16 +4,16 @@
 define i64 @test_vqshld_s64_i(i64 %a) {
 ; CHECK-LABEL: test_vqshld_s64_i:
 ; CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, #36
-  %1 = tail call i64 @llvm.arm64.neon.sqshl.i64(i64 %a, i64 36)
+  %1 = tail call i64 @llvm.aarch64.neon.sqshl.i64(i64 %a, i64 36)
   ret i64 %1
 }
 
 define i64 @test_vqshld_u64_i(i64 %a) {
 ; CHECK-LABEL: test_vqshld_u64_i:
 ; CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, #36
-  %1 = tail call i64 @llvm.arm64.neon.uqshl.i64(i64 %a, i64 36)
+  %1 = tail call i64 @llvm.aarch64.neon.uqshl.i64(i64 %a, i64 36)
   ret i64 %1
 }
 
-declare i64 @llvm.arm64.neon.uqshl.i64(i64, i64)
-declare i64 @llvm.arm64.neon.sqshl.i64(i64, i64)
+declare i64 @llvm.aarch64.neon.uqshl.i64(i64, i64)
+declare i64 @llvm.aarch64.neon.sqshl.i64(i64, i64)
diff --git a/llvm/test/CodeGen/ARM64/2014-04-29-EXT-undef-mask.ll b/llvm/test/CodeGen/AArch64/arm64-2014-04-29-EXT-undef-mask.ll
similarity index 93%
rename from llvm/test/CodeGen/ARM64/2014-04-29-EXT-undef-mask.ll
rename to llvm/test/CodeGen/AArch64/arm64-2014-04-29-EXT-undef-mask.ll
index b0ab9fd..1b2d543 100644
--- a/llvm/test/CodeGen/ARM64/2014-04-29-EXT-undef-mask.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-2014-04-29-EXT-undef-mask.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -O0 -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 ; The following 2 test cases test shufflevector with beginning UNDEF mask.
 define <8 x i16> @test_vext_undef_traverse(<8 x i16> %in) {
diff --git a/llvm/test/CodeGen/ARM64/AdvSIMD-Scalar.ll b/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
similarity index 89%
rename from llvm/test/CodeGen/ARM64/AdvSIMD-Scalar.ll
rename to llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
index 3e75eed..c4597d5 100644
--- a/llvm/test/CodeGen/ARM64/AdvSIMD-Scalar.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-AdvSIMD-Scalar.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -arm64-simd-scalar=true -asm-verbose=false | FileCheck %s
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=generic -arm64-simd-scalar=true -asm-verbose=false | FileCheck %s -check-prefix=GENERIC
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -aarch64-simd-scalar=true -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=generic -aarch64-simd-scalar=true -asm-verbose=false | FileCheck %s -check-prefix=GENERIC
 
 define <2 x i64> @bar(<2 x i64> %a, <2 x i64> %b) nounwind readnone {
 ; CHECK-LABEL: bar:
diff --git a/llvm/test/CodeGen/ARM64/aapcs.ll b/llvm/test/CodeGen/AArch64/arm64-aapcs.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/aapcs.ll
rename to llvm/test/CodeGen/AArch64/arm64-aapcs.ll
diff --git a/llvm/test/CodeGen/ARM64/abi-varargs.ll b/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/abi-varargs.ll
rename to llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll
diff --git a/llvm/test/CodeGen/ARM64/abi.ll b/llvm/test/CodeGen/AArch64/arm64-abi.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/abi.ll
rename to llvm/test/CodeGen/AArch64/arm64-abi.ll
diff --git a/llvm/test/CodeGen/ARM64/abi_align.ll b/llvm/test/CodeGen/AArch64/arm64-abi_align.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/abi_align.ll
rename to llvm/test/CodeGen/AArch64/arm64-abi_align.ll
diff --git a/llvm/test/CodeGen/ARM64/addp.ll b/llvm/test/CodeGen/AArch64/arm64-addp.ll
similarity index 90%
rename from llvm/test/CodeGen/ARM64/addp.ll
rename to llvm/test/CodeGen/AArch64/arm64-addp.ll
index 428f6d4..3f1e5c5 100644
--- a/llvm/test/CodeGen/ARM64/addp.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-addp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -mcpu=cyclone | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -mcpu=cyclone | FileCheck %s
 
 define double @foo(<2 x double> %a) nounwind {
 ; CHECK-LABEL: foo:
diff --git a/llvm/test/CodeGen/ARM64/addr-mode-folding.ll b/llvm/test/CodeGen/AArch64/arm64-addr-mode-folding.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/addr-mode-folding.ll
rename to llvm/test/CodeGen/AArch64/arm64-addr-mode-folding.ll
diff --git a/llvm/test/CodeGen/ARM64/addr-type-promotion.ll b/llvm/test/CodeGen/AArch64/arm64-addr-type-promotion.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/addr-type-promotion.ll
rename to llvm/test/CodeGen/AArch64/arm64-addr-type-promotion.ll
diff --git a/llvm/test/CodeGen/ARM64/addrmode.ll b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/addrmode.ll
rename to llvm/test/CodeGen/AArch64/arm64-addrmode.ll
diff --git a/llvm/test/CodeGen/ARM64/alloc-no-stack-realign.ll b/llvm/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/alloc-no-stack-realign.ll
rename to llvm/test/CodeGen/AArch64/arm64-alloc-no-stack-realign.ll
diff --git a/llvm/test/CodeGen/ARM64/alloca-frame-pointer-offset.ll b/llvm/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/alloca-frame-pointer-offset.ll
rename to llvm/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll
diff --git a/llvm/test/CodeGen/ARM64/andCmpBrToTBZ.ll b/llvm/test/CodeGen/AArch64/arm64-andCmpBrToTBZ.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/andCmpBrToTBZ.ll
rename to llvm/test/CodeGen/AArch64/arm64-andCmpBrToTBZ.ll
diff --git a/llvm/test/CodeGen/ARM64/ands-bad-peephole.ll b/llvm/test/CodeGen/AArch64/arm64-ands-bad-peephole.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/ands-bad-peephole.ll
rename to llvm/test/CodeGen/AArch64/arm64-ands-bad-peephole.ll
diff --git a/llvm/test/CodeGen/ARM64/anyregcc-crash.ll b/llvm/test/CodeGen/AArch64/arm64-anyregcc-crash.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/anyregcc-crash.ll
rename to llvm/test/CodeGen/AArch64/arm64-anyregcc-crash.ll
diff --git a/llvm/test/CodeGen/ARM64/anyregcc.ll b/llvm/test/CodeGen/AArch64/arm64-anyregcc.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/anyregcc.ll
rename to llvm/test/CodeGen/AArch64/arm64-anyregcc.ll
diff --git a/llvm/test/CodeGen/AArch64/arm64-arith-saturating.ll b/llvm/test/CodeGen/AArch64/arm64-arith-saturating.ll
new file mode 100644
index 0000000..78cd1fc
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-arith-saturating.ll
@@ -0,0 +1,153 @@
+; RUN: llc < %s -march=arm64 -mcpu=cyclone | FileCheck %s
+
+define i32 @qadds(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: qadds:
+; CHECK: sqadd s0, s0, s1
+  %vecext = extractelement <4 x i32> %b, i32 0
+  %vecext1 = extractelement <4 x i32> %c, i32 0
+  %vqadd.i = tail call i32 @llvm.aarch64.neon.sqadd.i32(i32 %vecext, i32 %vecext1) nounwind
+  ret i32 %vqadd.i
+}
+
+define i64 @qaddd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: qaddd:
+; CHECK: sqadd d0, d0, d1
+  %vecext = extractelement <2 x i64> %b, i32 0
+  %vecext1 = extractelement <2 x i64> %c, i32 0
+  %vqadd.i = tail call i64 @llvm.aarch64.neon.sqadd.i64(i64 %vecext, i64 %vecext1) nounwind
+  ret i64 %vqadd.i
+}
+
+define i32 @uqadds(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: uqadds:
+; CHECK: uqadd s0, s0, s1
+  %vecext = extractelement <4 x i32> %b, i32 0
+  %vecext1 = extractelement <4 x i32> %c, i32 0
+  %vqadd.i = tail call i32 @llvm.aarch64.neon.uqadd.i32(i32 %vecext, i32 %vecext1) nounwind
+  ret i32 %vqadd.i
+}
+
+define i64 @uqaddd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: uqaddd:
+; CHECK: uqadd d0, d0, d1
+  %vecext = extractelement <2 x i64> %b, i32 0
+  %vecext1 = extractelement <2 x i64> %c, i32 0
+  %vqadd.i = tail call i64 @llvm.aarch64.neon.uqadd.i64(i64 %vecext, i64 %vecext1) nounwind
+  ret i64 %vqadd.i
+}
+
+declare i64 @llvm.aarch64.neon.uqadd.i64(i64, i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.uqadd.i32(i32, i32) nounwind readnone
+declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32) nounwind readnone
+
+define i32 @qsubs(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: qsubs:
+; CHECK: sqsub s0, s0, s1
+  %vecext = extractelement <4 x i32> %b, i32 0
+  %vecext1 = extractelement <4 x i32> %c, i32 0
+  %vqsub.i = tail call i32 @llvm.aarch64.neon.sqsub.i32(i32 %vecext, i32 %vecext1) nounwind
+  ret i32 %vqsub.i
+}
+
+define i64 @qsubd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: qsubd:
+; CHECK: sqsub d0, d0, d1
+  %vecext = extractelement <2 x i64> %b, i32 0
+  %vecext1 = extractelement <2 x i64> %c, i32 0
+  %vqsub.i = tail call i64 @llvm.aarch64.neon.sqsub.i64(i64 %vecext, i64 %vecext1) nounwind
+  ret i64 %vqsub.i
+}
+
+define i32 @uqsubs(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: uqsubs:
+; CHECK: uqsub s0, s0, s1
+  %vecext = extractelement <4 x i32> %b, i32 0
+  %vecext1 = extractelement <4 x i32> %c, i32 0
+  %vqsub.i = tail call i32 @llvm.aarch64.neon.uqsub.i32(i32 %vecext, i32 %vecext1) nounwind
+  ret i32 %vqsub.i
+}
+
+define i64 @uqsubd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
+; CHECK-LABEL: uqsubd:
+; CHECK: uqsub d0, d0, d1
+  %vecext = extractelement <2 x i64> %b, i32 0
+  %vecext1 = extractelement <2 x i64> %c, i32 0
+  %vqsub.i = tail call i64 @llvm.aarch64.neon.uqsub.i64(i64 %vecext, i64 %vecext1) nounwind
+  ret i64 %vqsub.i
+}
+
+declare i64 @llvm.aarch64.neon.uqsub.i64(i64, i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.uqsub.i32(i32, i32) nounwind readnone
+declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32) nounwind readnone
+
+define i32 @qabss(<4 x i32> %b, <4 x i32> %c) nounwind readnone {
+; CHECK-LABEL: qabss:
+; CHECK: sqabs s0, s0
+; CHECK: ret
+  %vecext = extractelement <4 x i32> %b, i32 0
+  %vqabs.i = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %vecext) nounwind
+  ret i32 %vqabs.i
+}
+
+define i64 @qabsd(<2 x i64> %b, <2 x i64> %c) nounwind readnone {
+; CHECK-LABEL: qabsd:
+; CHECK: sqabs d0, d0
+; CHECK: ret
+  %vecext = extractelement <2 x i64> %b, i32 0
+  %vqabs.i = tail call i64 @llvm.aarch64.neon.sqabs.i64(i64 %vecext) nounwind
+  ret i64 %vqabs.i
+}
+
+define i32 @qnegs(<4 x i32> %b, <4 x i32> %c) nounwind readnone {
+; CHECK-LABEL: qnegs:
+; CHECK: sqneg s0, s0
+; CHECK: ret
+  %vecext = extractelement <4 x i32> %b, i32 0
+  %vqneg.i = tail call i32 @llvm.aarch64.neon.sqneg.i32(i32 %vecext) nounwind
+  ret i32 %vqneg.i
+}
+
+define i64 @qnegd(<2 x i64> %b, <2 x i64> %c) nounwind readnone {
+; CHECK-LABEL: qnegd:
+; CHECK: sqneg d0, d0
+; CHECK: ret
+  %vecext = extractelement <2 x i64> %b, i32 0
+  %vqneg.i = tail call i64 @llvm.aarch64.neon.sqneg.i64(i64 %vecext) nounwind
+  ret i64 %vqneg.i
+}
+
+declare i64 @llvm.aarch64.neon.sqneg.i64(i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.sqneg.i32(i32) nounwind readnone
+declare i64 @llvm.aarch64.neon.sqabs.i64(i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.sqabs.i32(i32) nounwind readnone
+
+
+define i32 @vqmovund(<2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: vqmovund:
+; CHECK: sqxtun s0, d0
+  %vecext = extractelement <2 x i64> %b, i32 0
+  %vqmovun.i = tail call i32 @llvm.aarch64.neon.scalar.sqxtun.i32.i64(i64 %vecext) nounwind
+  ret i32 %vqmovun.i
+}
+
+define i32 @vqmovnd_s(<2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: vqmovnd_s:
+; CHECK: sqxtn s0, d0
+  %vecext = extractelement <2 x i64> %b, i32 0
+  %vqmovn.i = tail call i32 @llvm.aarch64.neon.scalar.sqxtn.i32.i64(i64 %vecext) nounwind
+  ret i32 %vqmovn.i
+}
+
+define i32 @vqmovnd_u(<2 x i64> %b) nounwind readnone {
+; CHECK-LABEL: vqmovnd_u:
+; CHECK: uqxtn s0, d0
+  %vecext = extractelement <2 x i64> %b, i32 0
+  %vqmovn.i = tail call i32 @llvm.aarch64.neon.scalar.uqxtn.i32.i64(i64 %vecext) nounwind
+  ret i32 %vqmovn.i
+}
+
+declare i32 @llvm.aarch64.neon.scalar.uqxtn.i32.i64(i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.scalar.sqxtn.i32.i64(i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.scalar.sqxtun.i32.i64(i64) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM64/arith.ll b/llvm/test/CodeGen/AArch64/arm64-arith.ll
similarity index 90%
rename from llvm/test/CodeGen/ARM64/arith.ll
rename to llvm/test/CodeGen/AArch64/arm64-arith.ll
index e4be8e9..ed9b569 100644
--- a/llvm/test/CodeGen/ARM64/arith.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-arith.ll
@@ -168,7 +168,7 @@
 ; CHECK-LABEL: t18:
 ; CHECK: sdiv w0, w0, w1
 ; CHECK: ret
-  %sdiv = call i32 @llvm.arm64.sdiv.i32(i32 %a, i32 %b)
+  %sdiv = call i32 @llvm.aarch64.sdiv.i32(i32 %a, i32 %b)
   ret i32 %sdiv
 }
 
@@ -177,7 +177,7 @@
 ; CHECK-LABEL: t19:
 ; CHECK: sdiv x0, x0, x1
 ; CHECK: ret
-  %sdiv = call i64 @llvm.arm64.sdiv.i64(i64 %a, i64 %b)
+  %sdiv = call i64 @llvm.aarch64.sdiv.i64(i64 %a, i64 %b)
   ret i64 %sdiv
 }
 
@@ -186,7 +186,7 @@
 ; CHECK-LABEL: t20:
 ; CHECK: udiv w0, w0, w1
 ; CHECK: ret
-  %udiv = call i32 @llvm.arm64.udiv.i32(i32 %a, i32 %b)
+  %udiv = call i32 @llvm.aarch64.udiv.i32(i32 %a, i32 %b)
   ret i32 %udiv
 }
 
@@ -195,14 +195,14 @@
 ; CHECK-LABEL: t21:
 ; CHECK: udiv x0, x0, x1
 ; CHECK: ret
-  %udiv = call i64 @llvm.arm64.udiv.i64(i64 %a, i64 %b)
+  %udiv = call i64 @llvm.aarch64.udiv.i64(i64 %a, i64 %b)
   ret i64 %udiv
 }
 
-declare i32 @llvm.arm64.sdiv.i32(i32, i32) nounwind readnone
-declare i64 @llvm.arm64.sdiv.i64(i64, i64) nounwind readnone
-declare i32 @llvm.arm64.udiv.i32(i32, i32) nounwind readnone
-declare i64 @llvm.arm64.udiv.i64(i64, i64) nounwind readnone
+declare i32 @llvm.aarch64.sdiv.i32(i32, i32) nounwind readnone
+declare i64 @llvm.aarch64.sdiv.i64(i64, i64) nounwind readnone
+declare i32 @llvm.aarch64.udiv.i32(i32, i32) nounwind readnone
+declare i64 @llvm.aarch64.udiv.i64(i64, i64) nounwind readnone
 
 ; 32-bit not.
 define i32 @inv_32(i32 %x) nounwind ssp {
diff --git a/llvm/test/CodeGen/ARM64/arm64-dead-def-elimination-flag.ll b/llvm/test/CodeGen/AArch64/arm64-arm64-dead-def-elimination-flag.ll
similarity index 80%
rename from llvm/test/CodeGen/ARM64/arm64-dead-def-elimination-flag.ll
rename to llvm/test/CodeGen/AArch64/arm64-arm64-dead-def-elimination-flag.ll
index babf482..0904b62 100644
--- a/llvm/test/CodeGen/ARM64/arm64-dead-def-elimination-flag.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-arm64-dead-def-elimination-flag.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm64 -arm64-dead-def-elimination=false < %s | FileCheck %s
+; RUN: llc -march=arm64 -aarch64-dead-def-elimination=false < %s | FileCheck %s
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 target triple = "arm64-apple-ios7.0.0"
diff --git a/llvm/test/CodeGen/ARM64/atomic-128.ll b/llvm/test/CodeGen/AArch64/arm64-atomic-128.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/atomic-128.ll
rename to llvm/test/CodeGen/AArch64/arm64-atomic-128.ll
diff --git a/llvm/test/CodeGen/ARM64/atomic.ll b/llvm/test/CodeGen/AArch64/arm64-atomic.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/atomic.ll
rename to llvm/test/CodeGen/AArch64/arm64-atomic.ll
diff --git a/llvm/test/CodeGen/ARM64/basic-pic.ll b/llvm/test/CodeGen/AArch64/arm64-basic-pic.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/basic-pic.ll
rename to llvm/test/CodeGen/AArch64/arm64-basic-pic.ll
diff --git a/llvm/test/CodeGen/ARM64/big-endian-bitconverts.ll b/llvm/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
similarity index 98%
rename from llvm/test/CodeGen/ARM64/big-endian-bitconverts.ll
rename to llvm/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
index cb8708b..f0e968b 100644
--- a/llvm/test/CodeGen/ARM64/big-endian-bitconverts.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-big-endian-bitconverts.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple arm64_be < %s -arm64-load-store-opt=false -O1 -o - | FileCheck %s
-; RUN: llc -mtriple arm64_be < %s -arm64-load-store-opt=false -O0 -fast-isel=true -o - | FileCheck %s
+; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -O1 -o - | FileCheck %s
+; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -O0 -fast-isel=true -o - | FileCheck %s
 
 ; CHECK-LABEL: test_i64_f64:
 define void @test_i64_f64(double* %p, i64* %q) {
diff --git a/llvm/test/CodeGen/ARM64/big-endian-eh.ll b/llvm/test/CodeGen/AArch64/arm64-big-endian-eh.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/big-endian-eh.ll
rename to llvm/test/CodeGen/AArch64/arm64-big-endian-eh.ll
diff --git a/llvm/test/CodeGen/ARM64/big-endian-varargs.ll b/llvm/test/CodeGen/AArch64/arm64-big-endian-varargs.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/big-endian-varargs.ll
rename to llvm/test/CodeGen/AArch64/arm64-big-endian-varargs.ll
diff --git a/llvm/test/CodeGen/ARM64/big-endian-vector-callee.ll b/llvm/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll
similarity index 98%
rename from llvm/test/CodeGen/ARM64/big-endian-vector-callee.ll
rename to llvm/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll
index 5b9ccac..1dcccf1 100644
--- a/llvm/test/CodeGen/ARM64/big-endian-vector-callee.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-big-endian-vector-callee.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple arm64_be < %s -arm64-load-store-opt=false -o - | FileCheck %s
-; RUN: llc -mtriple arm64_be < %s -fast-isel=true -arm64-load-store-opt=false -o - | FileCheck %s
+; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s
+; RUN: llc -mtriple arm64_be < %s -fast-isel=true -aarch64-load-store-opt=false -o - | FileCheck %s
 
 ; CHECK-LABEL: test_i64_f64:
 define i64 @test_i64_f64(double %p) {
diff --git a/llvm/test/CodeGen/ARM64/big-endian-vector-caller.ll b/llvm/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll
similarity index 99%
rename from llvm/test/CodeGen/ARM64/big-endian-vector-caller.ll
rename to llvm/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll
index 194a321..9a12b7a 100644
--- a/llvm/test/CodeGen/ARM64/big-endian-vector-caller.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-big-endian-vector-caller.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple arm64_be < %s -arm64-load-store-opt=false -o - | FileCheck %s
-; RUN: llc -mtriple arm64_be < %s -arm64-load-store-opt=false -fast-isel=true -O0 -o - | FileCheck %s
+; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -o - | FileCheck %s
+; RUN: llc -mtriple arm64_be < %s -aarch64-load-store-opt=false -fast-isel=true -O0 -o - | FileCheck %s
 
 ; CHECK-LABEL: test_i64_f64:
 declare i64 @test_i64_f64_helper(double %p)
diff --git a/llvm/test/CodeGen/ARM64/big-imm-offsets.ll b/llvm/test/CodeGen/AArch64/arm64-big-imm-offsets.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/big-imm-offsets.ll
rename to llvm/test/CodeGen/AArch64/arm64-big-imm-offsets.ll
diff --git a/llvm/test/CodeGen/ARM64/big-stack.ll b/llvm/test/CodeGen/AArch64/arm64-big-stack.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/big-stack.ll
rename to llvm/test/CodeGen/AArch64/arm64-big-stack.ll
diff --git a/llvm/test/CodeGen/ARM64/bitfield-extract.ll b/llvm/test/CodeGen/AArch64/arm64-bitfield-extract.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/bitfield-extract.ll
rename to llvm/test/CodeGen/AArch64/arm64-bitfield-extract.ll
diff --git a/llvm/test/CodeGen/ARM64/blockaddress.ll b/llvm/test/CodeGen/AArch64/arm64-blockaddress.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/blockaddress.ll
rename to llvm/test/CodeGen/AArch64/arm64-blockaddress.ll
diff --git a/llvm/test/CodeGen/ARM64/build-vector.ll b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll
similarity index 94%
rename from llvm/test/CodeGen/ARM64/build-vector.ll
rename to llvm/test/CodeGen/AArch64/arm64-build-vector.ll
index 143d689..c109263 100644
--- a/llvm/test/CodeGen/ARM64/build-vector.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 ; Check that building up a vector w/ only one non-zero lane initializes
 ; intelligently.
diff --git a/llvm/test/CodeGen/ARM64/call-tailcalls.ll b/llvm/test/CodeGen/AArch64/arm64-call-tailcalls.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/call-tailcalls.ll
rename to llvm/test/CodeGen/AArch64/arm64-call-tailcalls.ll
diff --git a/llvm/test/CodeGen/ARM64/cast-opt.ll b/llvm/test/CodeGen/AArch64/arm64-cast-opt.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/cast-opt.ll
rename to llvm/test/CodeGen/AArch64/arm64-cast-opt.ll
diff --git a/llvm/test/CodeGen/ARM64/ccmp-heuristics.ll b/llvm/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll
similarity index 98%
rename from llvm/test/CodeGen/ARM64/ccmp-heuristics.ll
rename to llvm/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll
index 5575997..664a26c 100644
--- a/llvm/test/CodeGen/ARM64/ccmp-heuristics.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ccmp-heuristics.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=cyclone -verify-machineinstrs -arm64-ccmp | FileCheck %s
+; RUN: llc < %s -mcpu=cyclone -verify-machineinstrs -aarch64-ccmp | FileCheck %s
 target triple = "arm64-apple-ios7.0.0"
 
 @channelColumns = external global i64
diff --git a/llvm/test/CodeGen/ARM64/ccmp.ll b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
similarity index 98%
rename from llvm/test/CodeGen/ARM64/ccmp.ll
rename to llvm/test/CodeGen/AArch64/arm64-ccmp.ll
index 79e6f94..63965f9 100644
--- a/llvm/test/CodeGen/ARM64/ccmp.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=cyclone -verify-machineinstrs -arm64-ccmp -arm64-stress-ccmp | FileCheck %s
+; RUN: llc < %s -mcpu=cyclone -verify-machineinstrs -aarch64-ccmp -aarch64-stress-ccmp | FileCheck %s
 target triple = "arm64-apple-ios"
 
 ; CHECK: single_same
diff --git a/llvm/test/CodeGen/ARM64/clrsb.ll b/llvm/test/CodeGen/AArch64/arm64-clrsb.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/clrsb.ll
rename to llvm/test/CodeGen/AArch64/arm64-clrsb.ll
diff --git a/llvm/test/CodeGen/ARM64/coalesce-ext.ll b/llvm/test/CodeGen/AArch64/arm64-coalesce-ext.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/coalesce-ext.ll
rename to llvm/test/CodeGen/AArch64/arm64-coalesce-ext.ll
diff --git a/llvm/test/CodeGen/ARM64/code-model-large-abs.ll b/llvm/test/CodeGen/AArch64/arm64-code-model-large-abs.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/code-model-large-abs.ll
rename to llvm/test/CodeGen/AArch64/arm64-code-model-large-abs.ll
diff --git a/llvm/test/CodeGen/ARM64/collect-loh-garbage-crash.ll b/llvm/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll
similarity index 90%
rename from llvm/test/CodeGen/ARM64/collect-loh-garbage-crash.ll
rename to llvm/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll
index 98cb625..81cee38 100644
--- a/llvm/test/CodeGen/ARM64/collect-loh-garbage-crash.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-collect-loh-garbage-crash.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-apple-ios -O3 -arm64-collect-loh -arm64-collect-loh-bb-only=true -arm64-collect-loh-pre-collect-register=false < %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-ios -O3 -aarch64-collect-loh -aarch64-collect-loh-bb-only=true -aarch64-collect-loh-pre-collect-register=false < %s -o - | FileCheck %s
 ; Check that the LOH analysis does not crash when the analysed chained
 ; contains instructions that are filtered out.
 ;
diff --git a/llvm/test/CodeGen/ARM64/collect-loh-str.ll b/llvm/test/CodeGen/AArch64/arm64-collect-loh-str.ll
similarity index 86%
rename from llvm/test/CodeGen/ARM64/collect-loh-str.ll
rename to llvm/test/CodeGen/AArch64/arm64-collect-loh-str.ll
index fc63f8b..d7bc00e 100644
--- a/llvm/test/CodeGen/ARM64/collect-loh-str.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-collect-loh-str.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-apple-ios -O2 -arm64-collect-loh -arm64-collect-loh-bb-only=false < %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-ios -O2 -aarch64-collect-loh -aarch64-collect-loh-bb-only=false < %s -o - | FileCheck %s
 ; Test case for <rdar://problem/15942912>.
 ; AdrpAddStr cannot be used when the store uses same
 ; register as address and value. Indeed, the related
diff --git a/llvm/test/CodeGen/ARM64/collect-loh.ll b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
similarity index 85%
rename from llvm/test/CodeGen/ARM64/collect-loh.ll
rename to llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
index e92690b..6d73daa 100644
--- a/llvm/test/CodeGen/ARM64/collect-loh.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-collect-loh.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=arm64-apple-ios -O2 -arm64-collect-loh -arm64-collect-loh-bb-only=false < %s -o - | FileCheck %s
-; RUN: llc -mtriple=arm64-linux-gnu -O2 -arm64-collect-loh -arm64-collect-loh-bb-only=false < %s -o - | FileCheck %s --check-prefix=CHECK-ELF
+; RUN: llc -mtriple=arm64-apple-ios -O2 -aarch64-collect-loh -aarch64-collect-loh-bb-only=false < %s -o - | FileCheck %s
+; RUN: llc -mtriple=arm64-linux-gnu -O2 -aarch64-collect-loh -aarch64-collect-loh-bb-only=false < %s -o - | FileCheck %s --check-prefix=CHECK-ELF
 
 ; CHECK-ELF-NOT: .loh
 ; CHECK-ELF-NOT: AdrpAdrp
diff --git a/llvm/test/CodeGen/ARM64/complex-copy-noneon.ll b/llvm/test/CodeGen/AArch64/arm64-complex-copy-noneon.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/complex-copy-noneon.ll
rename to llvm/test/CodeGen/AArch64/arm64-complex-copy-noneon.ll
diff --git a/llvm/test/CodeGen/ARM64/complex-ret.ll b/llvm/test/CodeGen/AArch64/arm64-complex-ret.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/complex-ret.ll
rename to llvm/test/CodeGen/AArch64/arm64-complex-ret.ll
diff --git a/llvm/test/CodeGen/ARM64/const-addr.ll b/llvm/test/CodeGen/AArch64/arm64-const-addr.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/const-addr.ll
rename to llvm/test/CodeGen/AArch64/arm64-const-addr.ll
diff --git a/llvm/test/CodeGen/ARM64/convert-v2f64-v2i32.ll b/llvm/test/CodeGen/AArch64/arm64-convert-v2f64-v2i32.ll
similarity index 86%
rename from llvm/test/CodeGen/ARM64/convert-v2f64-v2i32.ll
rename to llvm/test/CodeGen/AArch64/arm64-convert-v2f64-v2i32.ll
index 1a07c98..d862b1e 100644
--- a/llvm/test/CodeGen/ARM64/convert-v2f64-v2i32.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-convert-v2f64-v2i32.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 ; CHECK: fptosi_1
 ; CHECK: fcvtzs.2d
diff --git a/llvm/test/CodeGen/ARM64/convert-v2i32-v2f64.ll b/llvm/test/CodeGen/AArch64/arm64-convert-v2i32-v2f64.ll
similarity index 90%
rename from llvm/test/CodeGen/ARM64/convert-v2i32-v2f64.ll
rename to llvm/test/CodeGen/AArch64/arm64-convert-v2i32-v2f64.ll
index 63129a4..daaf1e0 100644
--- a/llvm/test/CodeGen/ARM64/convert-v2i32-v2f64.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-convert-v2i32-v2f64.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 define <2 x double> @f1(<2 x i32> %v) nounwind readnone {
 ; CHECK-LABEL: f1:
diff --git a/llvm/test/CodeGen/ARM64/copy-tuple.ll b/llvm/test/CodeGen/AArch64/arm64-copy-tuple.ll
similarity index 69%
rename from llvm/test/CodeGen/ARM64/copy-tuple.ll
rename to llvm/test/CodeGen/AArch64/arm64-copy-tuple.ll
index f998193..1803787 100644
--- a/llvm/test/CodeGen/ARM64/copy-tuple.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-copy-tuple.ll
@@ -13,14 +13,14 @@
 ; CHECK: mov.8b v1, v0
 entry:
   %addr_v8i8 = bitcast i8* %addr to <8 x i8>*
-  %vec = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
+  %vec = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
   %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0
   %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1
   tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
-  tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+  tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
 
   tail call void asm sideeffect "", "~{v0},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
-  tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+  tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
   ret void
 }
 
@@ -30,14 +30,14 @@
 ; CHECK: mov.8b v1, v2
 entry:
   %addr_v8i8 = bitcast i8* %addr to <8 x i8>*
-  %vec = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
+  %vec = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
   %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0
   %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1
   tail call void asm sideeffect "", "~{v0},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
-  tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+  tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
 
   tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
-  tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+  tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
   ret void
 }
 
@@ -47,14 +47,14 @@
 ; CHECK: mov.8b v0, v31
 entry:
   %addr_v8i8 = bitcast i8* %addr to <8 x i8>*
-  %vec = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
+  %vec = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
   %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0
   %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1
   tail call void asm sideeffect "", "~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30}"()
-  tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+  tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
 
   tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
-  tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+  tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
   ret void
 }
 
@@ -64,14 +64,14 @@
 ; CHECK: mov.8b v0, v1
 entry:
   %addr_v8i8 = bitcast i8* %addr to <8 x i8>*
-  %vec = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
+  %vec = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
   %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0
   %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1
   tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
-  tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+  tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
 
   tail call void asm sideeffect "", "~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30}"()
-  tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
+  tail call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr)
   ret void
 }
 
@@ -82,16 +82,16 @@
 ; CHECK: mov.8b v2, v0
 entry:
   %addr_v8i8 = bitcast i8* %addr to <8 x i8>*
-  %vec = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
+  %vec = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* %addr_v8i8)
   %vec0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 0
   %vec1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 1
   %vec2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 2
 
   tail call void asm sideeffect "", "~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
-  tail call void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, i8* %addr)
+  tail call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, i8* %addr)
 
   tail call void asm sideeffect "", "~{v0},~{v1},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
-  tail call void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, i8* %addr)
+  tail call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, i8* %addr)
   ret void
 }
 
@@ -102,15 +102,15 @@
 ; CHECK: mov.16b v2, v3
 entry:
   %addr_v16i8 = bitcast i8* %addr to <16 x i8>*
-  %vec = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3.v16i8.p0v16i8(<16 x i8>* %addr_v16i8)
+  %vec = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* %addr_v16i8)
   %vec0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 0
   %vec1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 1
   %vec2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 2
   tail call void asm sideeffect "", "~{v0},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
-  tail call void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, i8* %addr)
+  tail call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, i8* %addr)
 
   tail call void asm sideeffect "", "~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
-  tail call void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, i8* %addr)
+  tail call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, i8* %addr)
   ret void
 }
 
@@ -121,26 +121,26 @@
 ; CHECK: mov.16b v2, v31
 ; CHECK: mov.16b v1, v30
   %addr_v16i8 = bitcast i8* %addr to <16 x i8>*
-  %vec = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0v16i8(<16 x i8>* %addr_v16i8)
+  %vec = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* %addr_v16i8)
   %vec0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 0
   %vec1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 1
   %vec2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 2
   %vec3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 3
 
   tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}"()
-  tail call void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, <16 x i8> %vec3, i8* %addr)
+  tail call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, <16 x i8> %vec3, i8* %addr)
 
   tail call void asm sideeffect "", "~{v0},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"()
-  tail call void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, <16 x i8> %vec3, i8* %addr)
+  tail call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, <16 x i8> %vec3, i8* %addr)
   ret void
 }
 
-declare { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>*)
-declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3.v8i8.p0v8i8(<8 x i8>*)
-declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3.v16i8.p0v16i8(<16 x i8>*)
-declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0v16i8(<16 x i8>*)
+declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>*)
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>*)
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>*)
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>*)
 
-declare void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*)
-declare void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*)
-declare void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*)
-declare void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*)
+declare void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*)
+declare void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*)
+declare void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*)
+declare void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*)
diff --git a/llvm/test/CodeGen/AArch64/arm64-crc32.ll b/llvm/test/CodeGen/AArch64/arm64-crc32.ll
new file mode 100644
index 0000000..d3099e6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-crc32.ll
@@ -0,0 +1,71 @@
+; RUN: llc -march=arm64 -mattr=+crc -o - %s | FileCheck %s
+
+define i32 @test_crc32b(i32 %cur, i8 %next) {
+; CHECK-LABEL: test_crc32b:
+; CHECK: crc32b w0, w0, w1
+  %bits = zext i8 %next to i32
+  %val = call i32 @llvm.aarch64.crc32b(i32 %cur, i32 %bits)
+  ret i32 %val
+}
+
+define i32 @test_crc32h(i32 %cur, i16 %next) {
+; CHECK-LABEL: test_crc32h:
+; CHECK: crc32h w0, w0, w1
+  %bits = zext i16 %next to i32
+  %val = call i32 @llvm.aarch64.crc32h(i32 %cur, i32 %bits)
+  ret i32 %val
+}
+
+define i32 @test_crc32w(i32 %cur, i32 %next) {
+; CHECK-LABEL: test_crc32w:
+; CHECK: crc32w w0, w0, w1
+  %val = call i32 @llvm.aarch64.crc32w(i32 %cur, i32 %next)
+  ret i32 %val
+}
+
+define i32 @test_crc32x(i32 %cur, i64 %next) {
+; CHECK-LABEL: test_crc32x:
+; CHECK: crc32x w0, w0, x1
+  %val = call i32 @llvm.aarch64.crc32x(i32 %cur, i64 %next)
+  ret i32 %val
+}
+
+define i32 @test_crc32cb(i32 %cur, i8 %next) {
+; CHECK-LABEL: test_crc32cb:
+; CHECK: crc32cb w0, w0, w1
+  %bits = zext i8 %next to i32
+  %val = call i32 @llvm.aarch64.crc32cb(i32 %cur, i32 %bits)
+  ret i32 %val
+}
+
+define i32 @test_crc32ch(i32 %cur, i16 %next) {
+; CHECK-LABEL: test_crc32ch:
+; CHECK: crc32ch w0, w0, w1
+  %bits = zext i16 %next to i32
+  %val = call i32 @llvm.aarch64.crc32ch(i32 %cur, i32 %bits)
+  ret i32 %val
+}
+
+define i32 @test_crc32cw(i32 %cur, i32 %next) {
+; CHECK-LABEL: test_crc32cw:
+; CHECK: crc32cw w0, w0, w1
+  %val = call i32 @llvm.aarch64.crc32cw(i32 %cur, i32 %next)
+  ret i32 %val
+}
+
+define i32 @test_crc32cx(i32 %cur, i64 %next) {
+; CHECK-LABEL: test_crc32cx:
+; CHECK: crc32cx w0, w0, x1
+  %val = call i32 @llvm.aarch64.crc32cx(i32 %cur, i64 %next)
+  ret i32 %val
+}
+
+declare i32 @llvm.aarch64.crc32b(i32, i32)
+declare i32 @llvm.aarch64.crc32h(i32, i32)
+declare i32 @llvm.aarch64.crc32w(i32, i32)
+declare i32 @llvm.aarch64.crc32x(i32, i64)
+
+declare i32 @llvm.aarch64.crc32cb(i32, i32)
+declare i32 @llvm.aarch64.crc32ch(i32, i32)
+declare i32 @llvm.aarch64.crc32cw(i32, i32)
+declare i32 @llvm.aarch64.crc32cx(i32, i64)
diff --git a/llvm/test/CodeGen/AArch64/arm64-crypto.ll b/llvm/test/CodeGen/AArch64/arm64-crypto.ll
new file mode 100644
index 0000000..2908b33
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-crypto.ll
@@ -0,0 +1,135 @@
+; RUN: llc -march=arm64 -mattr=crypto -aarch64-neon-syntax=apple -o - %s | FileCheck %s
+
+declare <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %data, <16 x i8> %key)
+declare <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %data, <16 x i8> %key)
+declare <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %data)
+declare <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %data)
+
+define <16 x i8> @test_aese(<16 x i8> %data, <16 x i8> %key) {
+; CHECK-LABEL: test_aese:
+; CHECK: aese.16b v0, v1
+  %res = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %data, <16 x i8> %key)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_aesd(<16 x i8> %data, <16 x i8> %key) {
+; CHECK-LABEL: test_aesd:
+; CHECK: aesd.16b v0, v1
+  %res = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %data, <16 x i8> %key)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_aesmc(<16 x i8> %data) {
+; CHECK-LABEL: test_aesmc:
+; CHECK: aesmc.16b v0, v0
+ %res = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %data)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_aesimc(<16 x i8> %data) {
+; CHECK-LABEL: test_aesimc:
+; CHECK: aesimc.16b v0, v0
+ %res = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %data)
+  ret <16 x i8> %res
+}
+
+declare <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+declare <4 x i32> @llvm.aarch64.crypto.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+declare <4 x i32> @llvm.aarch64.crypto.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+declare i32 @llvm.aarch64.crypto.sha1h(i32 %hash_e)
+declare <4 x i32> @llvm.aarch64.crypto.sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11)
+declare <4 x i32> @llvm.aarch64.crypto.sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15)
+
+define <4 x i32> @test_sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
+; CHECK-LABEL: test_sha1c:
+; CHECK: fmov [[HASH_E:s[0-9]+]], w0
+; CHECK: sha1c.4s q0, [[HASH_E]], v1
+  %res = call <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+  ret <4 x i32> %res
+}
+
+; <rdar://problem/14742333> Incomplete removal of unnecessary FMOV instructions in intrinsic SHA1
+define <4 x i32> @test_sha1c_in_a_row(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
+; CHECK-LABEL: test_sha1c_in_a_row:
+; CHECK: fmov [[HASH_E:s[0-9]+]], w0
+; CHECK: sha1c.4s q[[SHA1RES:[0-9]+]], [[HASH_E]], v1
+; CHECK-NOT: fmov
+; CHECK: sha1c.4s q0, s[[SHA1RES]], v1
+  %res = call <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+  %extract = extractelement <4 x i32> %res, i32 0
+  %res2 = call <4 x i32> @llvm.aarch64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %extract, <4 x i32> %wk)
+  ret <4 x i32> %res2
+}
+
+define <4 x i32> @test_sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
+; CHECK-LABEL: test_sha1p:
+; CHECK: fmov [[HASH_E:s[0-9]+]], w0
+; CHECK: sha1p.4s q0, [[HASH_E]], v1
+  %res = call <4 x i32> @llvm.aarch64.crypto.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
+; CHECK-LABEL: test_sha1m:
+; CHECK: fmov [[HASH_E:s[0-9]+]], w0
+; CHECK: sha1m.4s q0, [[HASH_E]], v1
+  %res = call <4 x i32> @llvm.aarch64.crypto.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
+  ret <4 x i32> %res
+}
+
+define i32 @test_sha1h(i32 %hash_e) {
+; CHECK-LABEL: test_sha1h:
+; CHECK: fmov [[HASH_E:s[0-9]+]], w0
+; CHECK: sha1h [[RES:s[0-9]+]], [[HASH_E]]
+; CHECK: fmov w0, [[RES]]
+  %res = call i32 @llvm.aarch64.crypto.sha1h(i32 %hash_e)
+  ret i32 %res
+}
+
+define <4 x i32> @test_sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11) {
+; CHECK-LABEL: test_sha1su0:
+; CHECK: sha1su0.4s v0, v1, v2
+  %res = call <4 x i32> @llvm.aarch64.crypto.sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15) {
+; CHECK-LABEL: test_sha1su1:
+; CHECK: sha1su1.4s v0, v1
+  %res = call <4 x i32> @llvm.aarch64.crypto.sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15)
+  ret <4 x i32> %res
+}
+
+declare <4 x i32> @llvm.aarch64.crypto.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk)
+declare <4 x i32> @llvm.aarch64.crypto.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk)
+declare <4 x i32> @llvm.aarch64.crypto.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7)
+declare <4 x i32> @llvm.aarch64.crypto.sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15)
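+; Note the operand order above: sha256h takes the ABCD half of the hash state
+; as its first operand, while sha256h2 takes the EFGH half first.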
+
+define <4 x i32> @test_sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) {
+; CHECK-LABEL: test_sha256h:
+; CHECK: sha256h.4s q0, q1, v2
+  %res = call <4 x i32> @llvm.aarch64.crypto.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) {
+; CHECK-LABEL: test_sha256h2:
+; CHECK: sha256h2.4s q0, q1, v2
+  %res = call <4 x i32> @llvm.aarch64.crypto.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7) {
+; CHECK-LABEL: test_sha256su0:
+; CHECK: sha256su0.4s v0, v1
+  %res = call <4 x i32> @llvm.aarch64.crypto.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) {
+; CHECK-LABEL: test_sha256su1:
+; CHECK: sha256su1.4s v0, v1, v2
+  %res = call <4 x i32> @llvm.aarch64.crypto.sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15)
+  ret <4 x i32> %res
+}
diff --git a/llvm/test/CodeGen/ARM64/cse.ll b/llvm/test/CodeGen/AArch64/arm64-cse.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/cse.ll
rename to llvm/test/CodeGen/AArch64/arm64-cse.ll
diff --git a/llvm/test/CodeGen/ARM64/csel.ll b/llvm/test/CodeGen/AArch64/arm64-csel.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/csel.ll
rename to llvm/test/CodeGen/AArch64/arm64-csel.ll
diff --git a/llvm/test/CodeGen/AArch64/arm64-cvt.ll b/llvm/test/CodeGen/AArch64/arm64-cvt.ll
new file mode 100644
index 0000000..420a8bc
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-cvt.ll
@@ -0,0 +1,401 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
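+; Each conversion encodes its rounding mode in the mnemonic: fcvtas/fcvtau
+; round to nearest with ties away, fcvtms/fcvtmu toward -Inf, fcvtns/fcvtnu
+; to nearest with ties to even, fcvtps/fcvtpu toward +Inf, and fcvtzs/fcvtzu
+; toward zero; the trailing s or u selects a signed or unsigned result.
+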
+;
+; Floating-point scalar convert to signed integer (to nearest with ties to away)
+;
+define i32 @fcvtas_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtas_1w1s:
+;CHECK: fcvtas w0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.aarch64.neon.fcvtas.i32.f32(float %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtas_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtas_1x1s:
+;CHECK: fcvtas x0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.aarch64.neon.fcvtas.i64.f32(float %A)
+	ret i64 %tmp3
+}
+
+define i32 @fcvtas_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtas_1w1d:
+;CHECK: fcvtas w0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.aarch64.neon.fcvtas.i32.f64(double %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtas_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtas_1x1d:
+;CHECK: fcvtas x0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.aarch64.neon.fcvtas.i64.f64(double %A)
+	ret i64 %tmp3
+}
+
+declare i32 @llvm.aarch64.neon.fcvtas.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtas.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtas.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtas.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to unsigned integer (to nearest with ties to away)
+;
+define i32 @fcvtau_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtau_1w1s:
+;CHECK: fcvtau w0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.aarch64.neon.fcvtau.i32.f32(float %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtau_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtau_1x1s:
+;CHECK: fcvtau x0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.aarch64.neon.fcvtau.i64.f32(float %A)
+	ret i64 %tmp3
+}
+
+define i32 @fcvtau_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtau_1w1d:
+;CHECK: fcvtau w0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.aarch64.neon.fcvtau.i32.f64(double %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtau_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtau_1x1d:
+;CHECK: fcvtau x0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.aarch64.neon.fcvtau.i64.f64(double %A)
+	ret i64 %tmp3
+}
+
+declare i32 @llvm.aarch64.neon.fcvtau.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtau.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtau.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtau.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to signed integer (toward -Inf)
+;
+define i32 @fcvtms_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtms_1w1s:
+;CHECK: fcvtms w0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.aarch64.neon.fcvtms.i32.f32(float %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtms_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtms_1x1s:
+;CHECK: fcvtms x0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.aarch64.neon.fcvtms.i64.f32(float %A)
+	ret i64 %tmp3
+}
+
+define i32 @fcvtms_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtms_1w1d:
+;CHECK: fcvtms w0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.aarch64.neon.fcvtms.i32.f64(double %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtms_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtms_1x1d:
+;CHECK: fcvtms x0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.aarch64.neon.fcvtms.i64.f64(double %A)
+	ret i64 %tmp3
+}
+
+declare i32 @llvm.aarch64.neon.fcvtms.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtms.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtms.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtms.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to unsigned integer (toward -Inf)
+;
+define i32 @fcvtmu_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtmu_1w1s:
+;CHECK: fcvtmu w0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtmu_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtmu_1x1s:
+;CHECK: fcvtmu x0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.aarch64.neon.fcvtmu.i64.f32(float %A)
+	ret i64 %tmp3
+}
+
+define i32 @fcvtmu_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtmu_1w1d:
+;CHECK: fcvtmu w0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.aarch64.neon.fcvtmu.i32.f64(double %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtmu_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtmu_1x1d:
+;CHECK: fcvtmu x0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double %A)
+	ret i64 %tmp3
+}
+
+declare i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtmu.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtmu.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to signed integer (to nearest with ties to even)
+;
+define i32 @fcvtns_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtns_1w1s:
+;CHECK: fcvtns w0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.aarch64.neon.fcvtns.i32.f32(float %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtns_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtns_1x1s:
+;CHECK: fcvtns x0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.aarch64.neon.fcvtns.i64.f32(float %A)
+	ret i64 %tmp3
+}
+
+define i32 @fcvtns_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtns_1w1d:
+;CHECK: fcvtns w0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.aarch64.neon.fcvtns.i32.f64(double %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtns_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtns_1x1d:
+;CHECK: fcvtns x0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.aarch64.neon.fcvtns.i64.f64(double %A)
+	ret i64 %tmp3
+}
+
+declare i32 @llvm.aarch64.neon.fcvtns.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtns.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtns.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtns.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to unsigned integer (to nearest with ties to even)
+;
+define i32 @fcvtnu_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtnu_1w1s:
+;CHECK: fcvtnu w0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.aarch64.neon.fcvtnu.i32.f32(float %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtnu_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtnu_1x1s:
+;CHECK: fcvtnu x0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.aarch64.neon.fcvtnu.i64.f32(float %A)
+	ret i64 %tmp3
+}
+
+define i32 @fcvtnu_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtnu_1w1d:
+;CHECK: fcvtnu w0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.aarch64.neon.fcvtnu.i32.f64(double %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtnu_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtnu_1x1d:
+;CHECK: fcvtnu x0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.aarch64.neon.fcvtnu.i64.f64(double %A)
+	ret i64 %tmp3
+}
+
+declare i32 @llvm.aarch64.neon.fcvtnu.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtnu.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtnu.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtnu.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to signed integer (toward +Inf)
+;
+define i32 @fcvtps_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtps_1w1s:
+;CHECK: fcvtps w0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.aarch64.neon.fcvtps.i32.f32(float %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtps_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtps_1x1s:
+;CHECK: fcvtps x0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.aarch64.neon.fcvtps.i64.f32(float %A)
+	ret i64 %tmp3
+}
+
+define i32 @fcvtps_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtps_1w1d:
+;CHECK: fcvtps w0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.aarch64.neon.fcvtps.i32.f64(double %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtps_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtps_1x1d:
+;CHECK: fcvtps x0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.aarch64.neon.fcvtps.i64.f64(double %A)
+	ret i64 %tmp3
+}
+
+declare i32 @llvm.aarch64.neon.fcvtps.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtps.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtps.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtps.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to unsigned integer (toward +Inf)
+;
+define i32 @fcvtpu_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtpu_1w1s:
+;CHECK: fcvtpu w0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.aarch64.neon.fcvtpu.i32.f32(float %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtpu_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtpu_1x1s:
+;CHECK: fcvtpu x0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.aarch64.neon.fcvtpu.i64.f32(float %A)
+	ret i64 %tmp3
+}
+
+define i32 @fcvtpu_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtpu_1w1d:
+;CHECK: fcvtpu w0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.aarch64.neon.fcvtpu.i32.f64(double %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtpu_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtpu_1x1d:
+;CHECK: fcvtpu x0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.aarch64.neon.fcvtpu.i64.f64(double %A)
+	ret i64 %tmp3
+}
+
+declare i32 @llvm.aarch64.neon.fcvtpu.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtpu.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtpu.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtpu.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to signed integer (toward zero)
+;
+define i32 @fcvtzs_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtzs_1w1s:
+;CHECK: fcvtzs w0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtzs_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtzs_1x1s:
+;CHECK: fcvtzs x0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.aarch64.neon.fcvtzs.i64.f32(float %A)
+	ret i64 %tmp3
+}
+
+define i32 @fcvtzs_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtzs_1w1d:
+;CHECK: fcvtzs w0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.aarch64.neon.fcvtzs.i32.f64(double %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtzs_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtzs_1x1d:
+;CHECK: fcvtzs x0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.aarch64.neon.fcvtzs.i64.f64(double %A)
+	ret i64 %tmp3
+}
+
+declare i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtzs.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtzs.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtzs.i64.f64(double) nounwind readnone
+
+;
+; Floating-point scalar convert to unsigned integer (toward zero)
+;
+define i32 @fcvtzu_1w1s(float %A) nounwind {
+;CHECK-LABEL: fcvtzu_1w1s:
+;CHECK: fcvtzu w0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.aarch64.neon.fcvtzu.i32.f32(float %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtzu_1x1s(float %A) nounwind {
+;CHECK-LABEL: fcvtzu_1x1s:
+;CHECK: fcvtzu x0, s0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.aarch64.neon.fcvtzu.i64.f32(float %A)
+	ret i64 %tmp3
+}
+
+define i32 @fcvtzu_1w1d(double %A) nounwind {
+;CHECK-LABEL: fcvtzu_1w1d:
+;CHECK: fcvtzu w0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i32 @llvm.aarch64.neon.fcvtzu.i32.f64(double %A)
+	ret i32 %tmp3
+}
+
+define i64 @fcvtzu_1x1d(double %A) nounwind {
+;CHECK-LABEL: fcvtzu_1x1d:
+;CHECK: fcvtzu x0, d0
+;CHECK-NEXT: ret
+	%tmp3 = call i64 @llvm.aarch64.neon.fcvtzu.i64.f64(double %A)
+	ret i64 %tmp3
+}
+
+declare i32 @llvm.aarch64.neon.fcvtzu.i32.f32(float) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtzu.i64.f32(float) nounwind readnone
+declare i32 @llvm.aarch64.neon.fcvtzu.i32.f64(double) nounwind readnone
+declare i64 @llvm.aarch64.neon.fcvtzu.i64.f64(double) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM64/dagcombiner-convergence.ll b/llvm/test/CodeGen/AArch64/arm64-dagcombiner-convergence.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/dagcombiner-convergence.ll
rename to llvm/test/CodeGen/AArch64/arm64-dagcombiner-convergence.ll
diff --git a/llvm/test/CodeGen/ARM64/dagcombiner-dead-indexed-load.ll b/llvm/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/dagcombiner-dead-indexed-load.ll
rename to llvm/test/CodeGen/AArch64/arm64-dagcombiner-dead-indexed-load.ll
diff --git a/llvm/test/CodeGen/ARM64/dagcombiner-indexed-load.ll b/llvm/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/dagcombiner-indexed-load.ll
rename to llvm/test/CodeGen/AArch64/arm64-dagcombiner-indexed-load.ll
diff --git a/llvm/test/CodeGen/ARM64/dagcombiner-load-slicing.ll b/llvm/test/CodeGen/AArch64/arm64-dagcombiner-load-slicing.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/dagcombiner-load-slicing.ll
rename to llvm/test/CodeGen/AArch64/arm64-dagcombiner-load-slicing.ll
diff --git a/llvm/test/CodeGen/ARM64/dead-def-frame-index.ll b/llvm/test/CodeGen/AArch64/arm64-dead-def-frame-index.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/dead-def-frame-index.ll
rename to llvm/test/CodeGen/AArch64/arm64-dead-def-frame-index.ll
diff --git a/llvm/test/CodeGen/ARM64/dead-register-def-bug.ll b/llvm/test/CodeGen/AArch64/arm64-dead-register-def-bug.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/dead-register-def-bug.ll
rename to llvm/test/CodeGen/AArch64/arm64-dead-register-def-bug.ll
diff --git a/llvm/test/CodeGen/ARM64/dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll
similarity index 98%
rename from llvm/test/CodeGen/ARM64/dup.ll
rename to llvm/test/CodeGen/AArch64/arm64-dup.ll
index 97774a7..0c56b46 100644
--- a/llvm/test/CodeGen/ARM64/dup.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
 
 define <8 x i8> @v_dup8(i8 %A) nounwind {
 ;CHECK-LABEL: v_dup8:
diff --git a/llvm/test/CodeGen/ARM64/early-ifcvt.ll b/llvm/test/CodeGen/AArch64/arm64-early-ifcvt.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/early-ifcvt.ll
rename to llvm/test/CodeGen/AArch64/arm64-early-ifcvt.ll
diff --git a/llvm/test/CodeGen/ARM64/elf-calls.ll b/llvm/test/CodeGen/AArch64/arm64-elf-calls.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/elf-calls.ll
rename to llvm/test/CodeGen/AArch64/arm64-elf-calls.ll
diff --git a/llvm/test/CodeGen/ARM64/elf-constpool.ll b/llvm/test/CodeGen/AArch64/arm64-elf-constpool.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/elf-constpool.ll
rename to llvm/test/CodeGen/AArch64/arm64-elf-constpool.ll
diff --git a/llvm/test/CodeGen/ARM64/elf-globals.ll b/llvm/test/CodeGen/AArch64/arm64-elf-globals.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/elf-globals.ll
rename to llvm/test/CodeGen/AArch64/arm64-elf-globals.ll
diff --git a/llvm/test/CodeGen/ARM64/ext.ll b/llvm/test/CodeGen/AArch64/arm64-ext.ll
similarity index 97%
rename from llvm/test/CodeGen/ARM64/ext.ll
rename to llvm/test/CodeGen/AArch64/arm64-ext.ll
index d368eef..67860de 100644
--- a/llvm/test/CodeGen/ARM64/ext.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ext.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 define <8 x i8> @test_vextd(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ;CHECK-LABEL: test_vextd:
diff --git a/llvm/test/CodeGen/ARM64/extend-int-to-fp.ll b/llvm/test/CodeGen/AArch64/arm64-extend-int-to-fp.ll
similarity index 85%
rename from llvm/test/CodeGen/ARM64/extend-int-to-fp.ll
rename to llvm/test/CodeGen/AArch64/arm64-extend-int-to-fp.ll
index 599a697..048fdb0 100644
--- a/llvm/test/CodeGen/ARM64/extend-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-extend-int-to-fp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 define <4 x float> @foo(<4 x i16> %a) nounwind {
 ; CHECK-LABEL: foo:
diff --git a/llvm/test/CodeGen/ARM64/extend.ll b/llvm/test/CodeGen/AArch64/arm64-extend.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/extend.ll
rename to llvm/test/CodeGen/AArch64/arm64-extend.ll
diff --git a/llvm/test/CodeGen/ARM64/extern-weak.ll b/llvm/test/CodeGen/AArch64/arm64-extern-weak.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/extern-weak.ll
rename to llvm/test/CodeGen/AArch64/arm64-extern-weak.ll
diff --git a/llvm/test/CodeGen/ARM64/extload-knownzero.ll b/llvm/test/CodeGen/AArch64/arm64-extload-knownzero.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/extload-knownzero.ll
rename to llvm/test/CodeGen/AArch64/arm64-extload-knownzero.ll
diff --git a/llvm/test/CodeGen/ARM64/extract.ll b/llvm/test/CodeGen/AArch64/arm64-extract.ll
similarity index 94%
rename from llvm/test/CodeGen/ARM64/extract.ll
rename to llvm/test/CodeGen/AArch64/arm64-extract.ll
index 02e4218..0198466 100644
--- a/llvm/test/CodeGen/ARM64/extract.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-extract.ll
@@ -1,4 +1,4 @@
-; RUN: llc -arm64-extr-generation=true -verify-machineinstrs < %s \
+; RUN: llc -aarch64-extr-generation=true -verify-machineinstrs < %s \
 ; RUN: -march=arm64 | FileCheck %s
 
 define i64 @ror_i64(i64 %in) {
diff --git a/llvm/test/CodeGen/ARM64/extract_subvector.ll b/llvm/test/CodeGen/AArch64/arm64-extract_subvector.ll
similarity index 94%
rename from llvm/test/CodeGen/ARM64/extract_subvector.ll
rename to llvm/test/CodeGen/AArch64/arm64-extract_subvector.ll
index 20c05fb..8b15a64 100644
--- a/llvm/test/CodeGen/ARM64/extract_subvector.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-extract_subvector.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s
 
 ; Extract of an upper half of a vector is an "ext.16b v0, v0, v0, #8" insn.
 
diff --git a/llvm/test/CodeGen/ARM64/fast-isel-addr-offset.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/fast-isel-addr-offset.ll
rename to llvm/test/CodeGen/AArch64/arm64-fast-isel-addr-offset.ll
diff --git a/llvm/test/CodeGen/ARM64/fast-isel-alloca.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/fast-isel-alloca.ll
rename to llvm/test/CodeGen/AArch64/arm64-fast-isel-alloca.ll
diff --git a/llvm/test/CodeGen/ARM64/fast-isel-br.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-br.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/fast-isel-br.ll
rename to llvm/test/CodeGen/AArch64/arm64-fast-isel-br.ll
diff --git a/llvm/test/CodeGen/ARM64/fast-isel-call.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-call.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/fast-isel-call.ll
rename to llvm/test/CodeGen/AArch64/arm64-fast-isel-call.ll
diff --git a/llvm/test/CodeGen/ARM64/fast-isel-conversion.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/fast-isel-conversion.ll
rename to llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion.ll
diff --git a/llvm/test/CodeGen/ARM64/fast-isel-fcmp.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/fast-isel-fcmp.ll
rename to llvm/test/CodeGen/AArch64/arm64-fast-isel-fcmp.ll
diff --git a/llvm/test/CodeGen/ARM64/fast-isel-gv.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-gv.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/fast-isel-gv.ll
rename to llvm/test/CodeGen/AArch64/arm64-fast-isel-gv.ll
diff --git a/llvm/test/CodeGen/ARM64/fast-isel-icmp.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/fast-isel-icmp.ll
rename to llvm/test/CodeGen/AArch64/arm64-fast-isel-icmp.ll
diff --git a/llvm/test/CodeGen/ARM64/fast-isel-indirectbr.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-indirectbr.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/fast-isel-indirectbr.ll
rename to llvm/test/CodeGen/AArch64/arm64-fast-isel-indirectbr.ll
diff --git a/llvm/test/CodeGen/ARM64/fast-isel-intrinsic.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/fast-isel-intrinsic.ll
rename to llvm/test/CodeGen/AArch64/arm64-fast-isel-intrinsic.ll
diff --git a/llvm/test/CodeGen/ARM64/fast-isel-materialize.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-materialize.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/fast-isel-materialize.ll
rename to llvm/test/CodeGen/AArch64/arm64-fast-isel-materialize.ll
diff --git a/llvm/test/CodeGen/ARM64/fast-isel-noconvert.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/fast-isel-noconvert.ll
rename to llvm/test/CodeGen/AArch64/arm64-fast-isel-noconvert.ll
diff --git a/llvm/test/CodeGen/ARM64/fast-isel-rem.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-rem.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/fast-isel-rem.ll
rename to llvm/test/CodeGen/AArch64/arm64-fast-isel-rem.ll
diff --git a/llvm/test/CodeGen/ARM64/fast-isel-ret.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-ret.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/fast-isel-ret.ll
rename to llvm/test/CodeGen/AArch64/arm64-fast-isel-ret.ll
diff --git a/llvm/test/CodeGen/ARM64/fast-isel-select.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-select.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/fast-isel-select.ll
rename to llvm/test/CodeGen/AArch64/arm64-fast-isel-select.ll
diff --git a/llvm/test/CodeGen/ARM64/fast-isel.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/fast-isel.ll
rename to llvm/test/CodeGen/AArch64/arm64-fast-isel.ll
diff --git a/llvm/test/CodeGen/ARM64/fastcc-tailcall.ll b/llvm/test/CodeGen/AArch64/arm64-fastcc-tailcall.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/fastcc-tailcall.ll
rename to llvm/test/CodeGen/AArch64/arm64-fastcc-tailcall.ll
diff --git a/llvm/test/CodeGen/ARM64/fastisel-gep-promote-before-add.ll b/llvm/test/CodeGen/AArch64/arm64-fastisel-gep-promote-before-add.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/fastisel-gep-promote-before-add.ll
rename to llvm/test/CodeGen/AArch64/arm64-fastisel-gep-promote-before-add.ll
diff --git a/llvm/test/CodeGen/ARM64/fcmp-opt.ll b/llvm/test/CodeGen/AArch64/arm64-fcmp-opt.ll
similarity index 98%
rename from llvm/test/CodeGen/ARM64/fcmp-opt.ll
rename to llvm/test/CodeGen/AArch64/arm64-fcmp-opt.ll
index e79eb9c..41027d4 100644
--- a/llvm/test/CodeGen/ARM64/fcmp-opt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fcmp-opt.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -mcpu=cyclone -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -mcpu=cyclone -aarch64-neon-syntax=apple | FileCheck %s
 ; rdar://10263824
 
 define i1 @fcmp_float1(float %a) nounwind ssp {
diff --git a/llvm/test/CodeGen/ARM64/fcopysign.ll b/llvm/test/CodeGen/AArch64/arm64-fcopysign.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/fcopysign.ll
rename to llvm/test/CodeGen/AArch64/arm64-fcopysign.ll
diff --git a/llvm/test/CodeGen/ARM64/fixed-point-scalar-cvt-dagcombine.ll b/llvm/test/CodeGen/AArch64/arm64-fixed-point-scalar-cvt-dagcombine.ll
similarity index 62%
rename from llvm/test/CodeGen/ARM64/fixed-point-scalar-cvt-dagcombine.ll
rename to llvm/test/CodeGen/AArch64/arm64-fixed-point-scalar-cvt-dagcombine.ll
index 77981f2..e51c38b2 100644
--- a/llvm/test/CodeGen/ARM64/fixed-point-scalar-cvt-dagcombine.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fixed-point-scalar-cvt-dagcombine.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 ; DAGCombine to transform a conversion of an extract_vector_elt to an
 ; extract_vector_elt of a conversion, which saves a round trip of copies
@@ -8,8 +8,8 @@
 ; CHECK:  scvtf.2d  [[REG:v[0-9]+]], v0, #9
 ; CHECK-NEXT:  ins.d v0[0], [[REG]][1]
   %vecext = extractelement <2 x i64> %a, i32 1
-  %fcvt_n = tail call double @llvm.arm64.neon.vcvtfxs2fp.f64.i64(i64 %vecext, i32 9)
+  %fcvt_n = tail call double @llvm.aarch64.neon.vcvtfxs2fp.f64.i64(i64 %vecext, i32 9)
   ret double %fcvt_n
 }
 
-declare double @llvm.arm64.neon.vcvtfxs2fp.f64.i64(i64, i32) nounwind readnone
+declare double @llvm.aarch64.neon.vcvtfxs2fp.f64.i64(i64, i32) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM64/fmadd.ll b/llvm/test/CodeGen/AArch64/arm64-fmadd.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/fmadd.ll
rename to llvm/test/CodeGen/AArch64/arm64-fmadd.ll
diff --git a/llvm/test/CodeGen/ARM64/fmax.ll b/llvm/test/CodeGen/AArch64/arm64-fmax.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/fmax.ll
rename to llvm/test/CodeGen/AArch64/arm64-fmax.ll
diff --git a/llvm/test/CodeGen/AArch64/arm64-fminv.ll b/llvm/test/CodeGen/AArch64/arm64-fminv.ll
new file mode 100644
index 0000000..f4c9735
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-fminv.ll
@@ -0,0 +1,101 @@
+; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
+
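+; For two-element vectors the across-lanes reduction is a single pairwise op
+; (fminp/fmaxp); only the four-element cases use the true across-lanes
+; fminv/fmaxv forms. The *nm variants follow IEEE-754 minNum/maxNum semantics.
+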
+define float @test_fminv_v2f32(<2 x float> %in) {
+; CHECK: test_fminv_v2f32:
+; CHECK: fminp s0, v0.2s
+  %min = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> %in)
+  ret float %min
+}
+
+define float @test_fminv_v4f32(<4 x float> %in) {
+; CHECK: test_fminv_v4f32:
+; CHECK: fminv s0, v0.4s
+  %min = call float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float> %in)
+  ret float %min
+}
+
+define double @test_fminv_v2f64(<2 x double> %in) {
+; CHECK: test_fminv_v2f64:
+; CHECK: fminp d0, v0.2d
+  %min = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> %in)
+  ret double %min
+}
+
+declare float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float>)
+declare float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float>)
+declare double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double>)
+
+define float @test_fmaxv_v2f32(<2 x float> %in) {
+; CHECK: test_fmaxv_v2f32:
+; CHECK: fmaxp s0, v0.2s
+  %max = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> %in)
+  ret float %max
+}
+
+define float @test_fmaxv_v4f32(<4 x float> %in) {
+; CHECK: test_fmaxv_v4f32:
+; CHECK: fmaxv s0, v0.4s
+  %max = call float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float> %in)
+  ret float %max
+}
+
+define double @test_fmaxv_v2f64(<2 x double> %in) {
+; CHECK: test_fmaxv_v2f64:
+; CHECK: fmaxp d0, v0.2d
+  %max = call double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> %in)
+  ret double %max
+}
+
+declare float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float>)
+declare float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float>)
+declare double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double>)
+
+define float @test_fminnmv_v2f32(<2 x float> %in) {
+; CHECK: test_fminnmv_v2f32:
+; CHECK: fminnmp s0, v0.2s
+  %minnm = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> %in)
+  ret float %minnm
+}
+
+define float @test_fminnmv_v4f32(<4 x float> %in) {
+; CHECK: test_fminnmv_v4f32:
+; CHECK: fminnmv s0, v0.4s
+  %minnm = call float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float> %in)
+  ret float %minnm
+}
+
+define double @test_fminnmv_v2f64(<2 x double> %in) {
+; CHECK: test_fminnmv_v2f64:
+; CHECK: fminnmp d0, v0.2d
+  %minnm = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> %in)
+  ret double %minnm
+}
+
+declare float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float>)
+declare float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float>)
+declare double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double>)
+
+define float @test_fmaxnmv_v2f32(<2 x float> %in) {
+; CHECK: test_fmaxnmv_v2f32:
+; CHECK: fmaxnmp s0, v0.2s
+  %maxnm = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> %in)
+  ret float %maxnm
+}
+
+define float @test_fmaxnmv_v4f32(<4 x float> %in) {
+; CHECK: test_fmaxnmv_v4f32:
+; CHECK: fmaxnmv s0, v0.4s
+  %maxnm = call float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float> %in)
+  ret float %maxnm
+}
+
+define double @test_fmaxnmv_v2f64(<2 x double> %in) {
+; CHECK: test_fmaxnmv_v2f64:
+; CHECK: fmaxnmp d0, v0.2d
+  %maxnm = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> %in)
+  ret double %maxnm
+}
+
+declare float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float>)
+declare float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float>)
+declare double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double>)
diff --git a/llvm/test/CodeGen/ARM64/fmuladd.ll b/llvm/test/CodeGen/AArch64/arm64-fmuladd.ll
similarity index 96%
rename from llvm/test/CodeGen/ARM64/fmuladd.ll
rename to llvm/test/CodeGen/AArch64/arm64-fmuladd.ll
index 174d830..6c5eeca 100644
--- a/llvm/test/CodeGen/ARM64/fmuladd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fmuladd.ll
@@ -1,4 +1,4 @@
-; RUN: llc -asm-verbose=false < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc -asm-verbose=false < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 define float @test_f32(float* %A, float* %B, float* %C) nounwind {
 ;CHECK-LABEL: test_f32:
diff --git a/llvm/test/CodeGen/ARM64/fold-address.ll b/llvm/test/CodeGen/AArch64/arm64-fold-address.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/fold-address.ll
rename to llvm/test/CodeGen/AArch64/arm64-fold-address.ll
diff --git a/llvm/test/CodeGen/ARM64/fold-lsl.ll b/llvm/test/CodeGen/AArch64/arm64-fold-lsl.ll
similarity index 96%
rename from llvm/test/CodeGen/ARM64/fold-lsl.ll
rename to llvm/test/CodeGen/AArch64/arm64-fold-lsl.ll
index 2e5762d..ec65e46 100644
--- a/llvm/test/CodeGen/ARM64/fold-lsl.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fold-lsl.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 ;
 ; <rdar://problem/14486451>
 
diff --git a/llvm/test/CodeGen/ARM64/fp-contract-zero.ll b/llvm/test/CodeGen/AArch64/arm64-fp-contract-zero.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/fp-contract-zero.ll
rename to llvm/test/CodeGen/AArch64/arm64-fp-contract-zero.ll
diff --git a/llvm/test/CodeGen/ARM64/fp-imm.ll b/llvm/test/CodeGen/AArch64/arm64-fp-imm.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/fp-imm.ll
rename to llvm/test/CodeGen/AArch64/arm64-fp-imm.ll
diff --git a/llvm/test/CodeGen/ARM64/fp.ll b/llvm/test/CodeGen/AArch64/arm64-fp.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/fp.ll
rename to llvm/test/CodeGen/AArch64/arm64-fp.ll
diff --git a/llvm/test/CodeGen/ARM64/fp128-folding.ll b/llvm/test/CodeGen/AArch64/arm64-fp128-folding.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/fp128-folding.ll
rename to llvm/test/CodeGen/AArch64/arm64-fp128-folding.ll
diff --git a/llvm/test/CodeGen/ARM64/fp128.ll b/llvm/test/CodeGen/AArch64/arm64-fp128.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/fp128.ll
rename to llvm/test/CodeGen/AArch64/arm64-fp128.ll
diff --git a/llvm/test/CodeGen/ARM64/frame-index.ll b/llvm/test/CodeGen/AArch64/arm64-frame-index.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/frame-index.ll
rename to llvm/test/CodeGen/AArch64/arm64-frame-index.ll
diff --git a/llvm/test/CodeGen/ARM64/frameaddr.ll b/llvm/test/CodeGen/AArch64/arm64-frameaddr.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/frameaddr.ll
rename to llvm/test/CodeGen/AArch64/arm64-frameaddr.ll
diff --git a/llvm/test/CodeGen/ARM64/global-address.ll b/llvm/test/CodeGen/AArch64/arm64-global-address.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/global-address.ll
rename to llvm/test/CodeGen/AArch64/arm64-global-address.ll
diff --git a/llvm/test/CodeGen/ARM64/hello.ll b/llvm/test/CodeGen/AArch64/arm64-hello.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/hello.ll
rename to llvm/test/CodeGen/AArch64/arm64-hello.ll
diff --git a/llvm/test/CodeGen/ARM64/i16-subreg-extract.ll b/llvm/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll
similarity index 80%
rename from llvm/test/CodeGen/ARM64/i16-subreg-extract.ll
rename to llvm/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll
index fc2e8b5..ba759e3 100644
--- a/llvm/test/CodeGen/ARM64/i16-subreg-extract.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-i16-subreg-extract.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 define i32 @foo(<4 x i16>* %__a) nounwind {
 ; CHECK-LABEL: foo:
diff --git a/llvm/test/CodeGen/ARM64/icmp-opt.ll b/llvm/test/CodeGen/AArch64/arm64-icmp-opt.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/icmp-opt.ll
rename to llvm/test/CodeGen/AArch64/arm64-icmp-opt.ll
diff --git a/llvm/test/CodeGen/ARM64/illegal-float-ops.ll b/llvm/test/CodeGen/AArch64/arm64-illegal-float-ops.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/illegal-float-ops.ll
rename to llvm/test/CodeGen/AArch64/arm64-illegal-float-ops.ll
diff --git a/llvm/test/CodeGen/ARM64/indexed-memory.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll
similarity index 98%
rename from llvm/test/CodeGen/ARM64/indexed-memory.ll
rename to llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll
index e390ed7..e501c6e 100644
--- a/llvm/test/CodeGen/ARM64/indexed-memory.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-redzone | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-redzone | FileCheck %s
 
 define void @store64(i64** nocapture %out, i64 %index, i64 %spacing) nounwind noinline ssp {
 ; CHECK-LABEL: store64:
diff --git a/llvm/test/CodeGen/ARM64/indexed-vector-ldst-2.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/indexed-vector-ldst-2.ll
rename to llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst-2.ll
diff --git a/llvm/test/CodeGen/ARM64/indexed-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
similarity index 75%
rename from llvm/test/CodeGen/ARM64/indexed-vector-ldst.ll
rename to llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
index 3d8ff53..9ee4063 100644
--- a/llvm/test/CodeGen/ARM64/indexed-vector-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
@@ -615,7 +615,7 @@
 define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld2(i8* %A, i8** %ptr) {
 ;CHECK-LABEL: test_v16i8_post_imm_ld2:
 ;CHECK: ld2.16b { v0, v1 }, [x0], #32
-  %ld2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2.v16i8.p0i8(i8* %A)
+  %ld2 = tail call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i32 32
   store i8* %tmp, i8** %ptr
   ret { <16 x i8>, <16 x i8> } %ld2
@@ -624,19 +624,19 @@
 define { <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld2(i8* %A, i8** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v16i8_post_reg_ld2:
 ;CHECK: ld2.16b { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2.v16i8.p0i8(i8* %A)
+  %ld2 = tail call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   store i8* %tmp, i8** %ptr
   ret { <16 x i8>, <16 x i8> } %ld2
 }
 
-declare { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2.v16i8.p0i8(i8*)
+declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0i8(i8*)
 
 
 define { <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld2(i8* %A, i8** %ptr) {
 ;CHECK-LABEL: test_v8i8_post_imm_ld2:
 ;CHECK: ld2.8b { v0, v1 }, [x0], #16
-  %ld2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0i8(i8* %A)
+  %ld2 = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i32 16
   store i8* %tmp, i8** %ptr
   ret { <8 x i8>, <8 x i8> } %ld2
@@ -645,19 +645,19 @@
 define { <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld2(i8* %A, i8** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v8i8_post_reg_ld2:
 ;CHECK: ld2.8b { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0i8(i8* %A)
+  %ld2 = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   store i8* %tmp, i8** %ptr
   ret { <8 x i8>, <8 x i8> } %ld2
 }
 
-declare { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0i8(i8*)
+declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0i8(i8*)
 
 
 define { <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld2(i16* %A, i16** %ptr) {
 ;CHECK-LABEL: test_v8i16_post_imm_ld2:
 ;CHECK: ld2.8h { v0, v1 }, [x0], #32
-  %ld2 = tail call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2.v8i16.p0i16(i16* %A)
+  %ld2 = tail call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i32 16
   store i16* %tmp, i16** %ptr
   ret { <8 x i16>, <8 x i16> } %ld2
@@ -666,19 +666,19 @@
 define { <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld2(i16* %A, i16** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v8i16_post_reg_ld2:
 ;CHECK: ld2.8h { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld2 = tail call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2.v8i16.p0i16(i16* %A)
+  %ld2 = tail call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   store i16* %tmp, i16** %ptr
   ret { <8 x i16>, <8 x i16> } %ld2
 }
 
-declare { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2.v8i16.p0i16(i16*)
+declare { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0i16(i16*)
 
 
 define { <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld2(i16* %A, i16** %ptr) {
 ;CHECK-LABEL: test_v4i16_post_imm_ld2:
 ;CHECK: ld2.4h { v0, v1 }, [x0], #16
-  %ld2 = tail call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2.v4i16.p0i16(i16* %A)
+  %ld2 = tail call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i32 8
   store i16* %tmp, i16** %ptr
   ret { <4 x i16>, <4 x i16> } %ld2
@@ -687,19 +687,19 @@
 define { <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld2(i16* %A, i16** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v4i16_post_reg_ld2:
 ;CHECK: ld2.4h { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld2 = tail call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2.v4i16.p0i16(i16* %A)
+  %ld2 = tail call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   store i16* %tmp, i16** %ptr
   ret { <4 x i16>, <4 x i16> } %ld2
 }
 
-declare { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2.v4i16.p0i16(i16*)
+declare { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0i16(i16*)
 
 
 define { <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld2(i32* %A, i32** %ptr) {
 ;CHECK-LABEL: test_v4i32_post_imm_ld2:
 ;CHECK: ld2.4s { v0, v1 }, [x0], #32
-  %ld2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2.v4i32.p0i32(i32* %A)
+  %ld2 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i32 8
   store i32* %tmp, i32** %ptr
   ret { <4 x i32>, <4 x i32> } %ld2
@@ -708,19 +708,19 @@
 define { <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld2(i32* %A, i32** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v4i32_post_reg_ld2:
 ;CHECK: ld2.4s { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2.v4i32.p0i32(i32* %A)
+  %ld2 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   store i32* %tmp, i32** %ptr
   ret { <4 x i32>, <4 x i32> } %ld2
 }
 
-declare { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2.v4i32.p0i32(i32*)
+declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i32(i32*)
 
 
 define { <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld2(i32* %A, i32** %ptr) {
 ;CHECK-LABEL: test_v2i32_post_imm_ld2:
 ;CHECK: ld2.2s { v0, v1 }, [x0], #16
-  %ld2 = tail call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2.v2i32.p0i32(i32* %A)
+  %ld2 = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i32 4
   store i32* %tmp, i32** %ptr
   ret { <2 x i32>, <2 x i32> } %ld2
@@ -729,19 +729,19 @@
 define { <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld2(i32* %A, i32** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v2i32_post_reg_ld2:
 ;CHECK: ld2.2s { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld2 = tail call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2.v2i32.p0i32(i32* %A)
+  %ld2 = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   store i32* %tmp, i32** %ptr
   ret { <2 x i32>, <2 x i32> } %ld2
 }
 
-declare { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2.v2i32.p0i32(i32*)
+declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0i32(i32*)
 
 
 define { <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld2(i64* %A, i64** %ptr) {
 ;CHECK-LABEL: test_v2i64_post_imm_ld2:
 ;CHECK: ld2.2d { v0, v1 }, [x0], #32
-  %ld2 = tail call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2.v2i64.p0i64(i64* %A)
+  %ld2 = tail call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i32 4
   store i64* %tmp, i64** %ptr
   ret { <2 x i64>, <2 x i64> } %ld2
@@ -750,19 +750,19 @@
 define { <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld2(i64* %A, i64** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v2i64_post_reg_ld2:
 ;CHECK: ld2.2d { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld2 = tail call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2.v2i64.p0i64(i64* %A)
+  %ld2 = tail call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   store i64* %tmp, i64** %ptr
   ret { <2 x i64>, <2 x i64> } %ld2
 }
 
-declare { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2.v2i64.p0i64(i64*)
+declare { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0i64(i64*)
 
 
 define { <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld2(i64* %A, i64** %ptr) {
 ;CHECK-LABEL: test_v1i64_post_imm_ld2:
 ;CHECK: ld1.1d { v0, v1 }, [x0], #16
-  %ld2 = tail call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2.v1i64.p0i64(i64* %A)
+  %ld2 = tail call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i32 2
   store i64* %tmp, i64** %ptr
   ret { <1 x i64>, <1 x i64> } %ld2
@@ -771,19 +771,19 @@
 define { <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld2(i64* %A, i64** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v1i64_post_reg_ld2:
 ;CHECK: ld1.1d { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld2 = tail call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2.v1i64.p0i64(i64* %A)
+  %ld2 = tail call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   store i64* %tmp, i64** %ptr
   ret { <1 x i64>, <1 x i64> } %ld2
 }
 
-declare { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2.v1i64.p0i64(i64*)
+declare { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0i64(i64*)
 
 
 define { <4 x float>, <4 x float> } @test_v4f32_post_imm_ld2(float* %A, float** %ptr) {
 ;CHECK-LABEL: test_v4f32_post_imm_ld2:
 ;CHECK: ld2.4s { v0, v1 }, [x0], #32
-  %ld2 = tail call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2.v4f32.p0f32(float* %A)
+  %ld2 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i32 8
   store float* %tmp, float** %ptr
   ret { <4 x float>, <4 x float> } %ld2
@@ -792,19 +792,19 @@
 define { <4 x float>, <4 x float> } @test_v4f32_post_reg_ld2(float* %A, float** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v4f32_post_reg_ld2:
 ;CHECK: ld2.4s { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld2 = tail call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2.v4f32.p0f32(float* %A)
+  %ld2 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   store float* %tmp, float** %ptr
   ret { <4 x float>, <4 x float> } %ld2
 }
 
-declare { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2.v4f32.p0f32(float*)
+declare { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0f32(float*)
 
 
 define { <2 x float>, <2 x float> } @test_v2f32_post_imm_ld2(float* %A, float** %ptr) {
 ;CHECK-LABEL: test_v2f32_post_imm_ld2:
 ;CHECK: ld2.2s { v0, v1 }, [x0], #16
-  %ld2 = tail call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2.v2f32.p0f32(float* %A)
+  %ld2 = tail call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2.v2f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i32 4
   store float* %tmp, float** %ptr
   ret { <2 x float>, <2 x float> } %ld2
@@ -813,19 +813,19 @@
 define { <2 x float>, <2 x float> } @test_v2f32_post_reg_ld2(float* %A, float** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v2f32_post_reg_ld2:
 ;CHECK: ld2.2s { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld2 = tail call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2.v2f32.p0f32(float* %A)
+  %ld2 = tail call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2.v2f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   store float* %tmp, float** %ptr
   ret { <2 x float>, <2 x float> } %ld2
 }
 
-declare { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2.v2f32.p0f32(float*)
+declare { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2.v2f32.p0f32(float*)
 
 
 define { <2 x double>, <2 x double> } @test_v2f64_post_imm_ld2(double* %A, double** %ptr) {
 ;CHECK-LABEL: test_v2f64_post_imm_ld2:
 ;CHECK: ld2.2d { v0, v1 }, [x0], #32
-  %ld2 = tail call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2.v2f64.p0f64(double* %A)
+  %ld2 = tail call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i32 4
   store double* %tmp, double** %ptr
   ret { <2 x double>, <2 x double> } %ld2
@@ -834,19 +834,19 @@
 define { <2 x double>, <2 x double> } @test_v2f64_post_reg_ld2(double* %A, double** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v2f64_post_reg_ld2:
 ;CHECK: ld2.2d { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld2 = tail call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2.v2f64.p0f64(double* %A)
+  %ld2 = tail call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   store double* %tmp, double** %ptr
   ret { <2 x double>, <2 x double> } %ld2
 }
 
-declare { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2.v2f64.p0f64(double*)
+declare { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0f64(double*)
 
 
 define { <1 x double>, <1 x double> } @test_v1f64_post_imm_ld2(double* %A, double** %ptr) {
 ;CHECK-LABEL: test_v1f64_post_imm_ld2:
 ;CHECK: ld1.1d { v0, v1 }, [x0], #16
-  %ld2 = tail call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2.v1f64.p0f64(double* %A)
+  %ld2 = tail call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2.v1f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i32 2
   store double* %tmp, double** %ptr
   ret { <1 x double>, <1 x double> } %ld2
@@ -855,19 +855,19 @@
 define { <1 x double>, <1 x double> } @test_v1f64_post_reg_ld2(double* %A, double** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v1f64_post_reg_ld2:
 ;CHECK: ld1.1d { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld2 = tail call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2.v1f64.p0f64(double* %A)
+  %ld2 = tail call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2.v1f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   store double* %tmp, double** %ptr
   ret { <1 x double>, <1 x double> } %ld2
 }
 
-declare { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2.v1f64.p0f64(double*)
+declare { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2.v1f64.p0f64(double*)
 
 
 define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld3(i8* %A, i8** %ptr) {
 ;CHECK-LABEL: test_v16i8_post_imm_ld3:
 ;CHECK: ld3.16b { v0, v1, v2 }, [x0], #48
-  %ld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3.v16i8.p0i8(i8* %A)
+  %ld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i32 48
   store i8* %tmp, i8** %ptr
   ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3
@@ -876,19 +876,19 @@
 define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld3(i8* %A, i8** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v16i8_post_reg_ld3:
 ;CHECK: ld3.16b { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3.v16i8.p0i8(i8* %A)
+  %ld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   store i8* %tmp, i8** %ptr
   ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3
 }
 
-declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3.v16i8.p0i8(i8*)
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0i8(i8*)
 
 
 define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld3(i8* %A, i8** %ptr) {
 ;CHECK-LABEL: test_v8i8_post_imm_ld3:
 ;CHECK: ld3.8b { v0, v1, v2 }, [x0], #24
-  %ld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3.v8i8.p0i8(i8* %A)
+  %ld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i32 24
   store i8* %tmp, i8** %ptr
   ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3
@@ -897,19 +897,19 @@
 define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld3(i8* %A, i8** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v8i8_post_reg_ld3:
 ;CHECK: ld3.8b { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3.v8i8.p0i8(i8* %A)
+  %ld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   store i8* %tmp, i8** %ptr
   ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3
 }
 
-declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3.v8i8.p0i8(i8*)
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0i8(i8*)
 
 
 define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld3(i16* %A, i16** %ptr) {
 ;CHECK-LABEL: test_v8i16_post_imm_ld3:
 ;CHECK: ld3.8h { v0, v1, v2 }, [x0], #48
-  %ld3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3.v8i16.p0i16(i16* %A)
+  %ld3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i32 24
   store i16* %tmp, i16** %ptr
   ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3
@@ -918,19 +918,19 @@
 define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld3(i16* %A, i16** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v8i16_post_reg_ld3:
 ;CHECK: ld3.8h { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3.v8i16.p0i16(i16* %A)
+  %ld3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   store i16* %tmp, i16** %ptr
   ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3
 }
 
-declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3.v8i16.p0i16(i16*)
+declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0i16(i16*)
 
 
 define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld3(i16* %A, i16** %ptr) {
 ;CHECK-LABEL: test_v4i16_post_imm_ld3:
 ;CHECK: ld3.4h { v0, v1, v2 }, [x0], #24
-  %ld3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3.v4i16.p0i16(i16* %A)
+  %ld3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i32 12
   store i16* %tmp, i16** %ptr
   ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3
@@ -939,19 +939,19 @@
 define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld3(i16* %A, i16** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v4i16_post_reg_ld3:
 ;CHECK: ld3.4h { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3.v4i16.p0i16(i16* %A)
+  %ld3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   store i16* %tmp, i16** %ptr
   ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3
 }
 
-declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3.v4i16.p0i16(i16*)
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0i16(i16*)
 
 
 define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld3(i32* %A, i32** %ptr) {
 ;CHECK-LABEL: test_v4i32_post_imm_ld3:
 ;CHECK: ld3.4s { v0, v1, v2 }, [x0], #48
-  %ld3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3.v4i32.p0i32(i32* %A)
+  %ld3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i32 12
   store i32* %tmp, i32** %ptr
   ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3
@@ -960,19 +960,19 @@
 define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld3(i32* %A, i32** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v4i32_post_reg_ld3:
 ;CHECK: ld3.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3.v4i32.p0i32(i32* %A)
+  %ld3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   store i32* %tmp, i32** %ptr
   ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3
 }
 
-declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3.v4i32.p0i32(i32*)
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0i32(i32*)
 
 
 define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld3(i32* %A, i32** %ptr) {
 ;CHECK-LABEL: test_v2i32_post_imm_ld3:
 ;CHECK: ld3.2s { v0, v1, v2 }, [x0], #24
-  %ld3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3.v2i32.p0i32(i32* %A)
+  %ld3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i32 6
   store i32* %tmp, i32** %ptr
   ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3
@@ -981,19 +981,19 @@
 define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld3(i32* %A, i32** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v2i32_post_reg_ld3:
 ;CHECK: ld3.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3.v2i32.p0i32(i32* %A)
+  %ld3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   store i32* %tmp, i32** %ptr
   ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3
 }
 
-declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3.v2i32.p0i32(i32*)
+declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0i32(i32*)
 
 
 define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld3(i64* %A, i64** %ptr) {
 ;CHECK-LABEL: test_v2i64_post_imm_ld3:
 ;CHECK: ld3.2d { v0, v1, v2 }, [x0], #48
-  %ld3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3.v2i64.p0i64(i64* %A)
+  %ld3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i32 6
   store i64* %tmp, i64** %ptr
   ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3
@@ -1002,19 +1002,19 @@
 define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld3(i64* %A, i64** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v2i64_post_reg_ld3:
 ;CHECK: ld3.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3.v2i64.p0i64(i64* %A)
+  %ld3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   store i64* %tmp, i64** %ptr
   ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3
 }
 
-declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3.v2i64.p0i64(i64*)
+declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0i64(i64*)
 
 
 define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld3(i64* %A, i64** %ptr) {
 ;CHECK-LABEL: test_v1i64_post_imm_ld3:
 ;CHECK: ld1.1d { v0, v1, v2 }, [x0], #24
-  %ld3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3.v1i64.p0i64(i64* %A)
+  %ld3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i32 3
   store i64* %tmp, i64** %ptr
   ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3
@@ -1023,19 +1023,19 @@
 define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld3(i64* %A, i64** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v1i64_post_reg_ld3:
 ;CHECK: ld1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3.v1i64.p0i64(i64* %A)
+  %ld3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   store i64* %tmp, i64** %ptr
   ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3
 }
 
-declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3.v1i64.p0i64(i64*)
+declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0i64(i64*)
 
 
 define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld3(float* %A, float** %ptr) {
 ;CHECK-LABEL: test_v4f32_post_imm_ld3:
 ;CHECK: ld3.4s { v0, v1, v2 }, [x0], #48
-  %ld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3.v4f32.p0f32(float* %A)
+  %ld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i32 12
   store float* %tmp, float** %ptr
   ret { <4 x float>, <4 x float>, <4 x float> } %ld3
@@ -1044,19 +1044,19 @@
 define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld3(float* %A, float** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v4f32_post_reg_ld3:
 ;CHECK: ld3.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3.v4f32.p0f32(float* %A)
+  %ld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   store float* %tmp, float** %ptr
   ret { <4 x float>, <4 x float>, <4 x float> } %ld3
 }
 
-declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3.v4f32.p0f32(float*)
+declare { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0f32(float*)
 
 
 define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld3(float* %A, float** %ptr) {
 ;CHECK-LABEL: test_v2f32_post_imm_ld3:
 ;CHECK: ld3.2s { v0, v1, v2 }, [x0], #24
-  %ld3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3.v2f32.p0f32(float* %A)
+  %ld3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3.v2f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i32 6
   store float* %tmp, float** %ptr
   ret { <2 x float>, <2 x float>, <2 x float> } %ld3
@@ -1065,19 +1065,19 @@
 define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld3(float* %A, float** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v2f32_post_reg_ld3:
 ;CHECK: ld3.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3.v2f32.p0f32(float* %A)
+  %ld3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3.v2f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   store float* %tmp, float** %ptr
   ret { <2 x float>, <2 x float>, <2 x float> } %ld3
 }
 
-declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3.v2f32.p0f32(float*)
+declare { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3.v2f32.p0f32(float*)
 
 
 define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld3(double* %A, double** %ptr) {
 ;CHECK-LABEL: test_v2f64_post_imm_ld3:
 ;CHECK: ld3.2d { v0, v1, v2 }, [x0], #48
-  %ld3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3.v2f64.p0f64(double* %A)
+  %ld3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3.v2f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i32 6
   store double* %tmp, double** %ptr
   ret { <2 x double>, <2 x double>, <2 x double> } %ld3
@@ -1086,19 +1086,19 @@
 define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld3(double* %A, double** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v2f64_post_reg_ld3:
 ;CHECK: ld3.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3.v2f64.p0f64(double* %A)
+  %ld3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3.v2f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   store double* %tmp, double** %ptr
   ret { <2 x double>, <2 x double>, <2 x double> } %ld3
 }
 
-declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3.v2f64.p0f64(double*)
+declare { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3.v2f64.p0f64(double*)
 
 
 define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld3(double* %A, double** %ptr) {
 ;CHECK-LABEL: test_v1f64_post_imm_ld3:
 ;CHECK: ld1.1d { v0, v1, v2 }, [x0], #24
-  %ld3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3.v1f64.p0f64(double* %A)
+  %ld3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3.v1f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i32 3
   store double* %tmp, double** %ptr
   ret { <1 x double>, <1 x double>, <1 x double> } %ld3
@@ -1107,19 +1107,19 @@
 define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld3(double* %A, double** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v1f64_post_reg_ld3:
 ;CHECK: ld1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3.v1f64.p0f64(double* %A)
+  %ld3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3.v1f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   store double* %tmp, double** %ptr
   ret { <1 x double>, <1 x double>, <1 x double> } %ld3
 }
 
-declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3.v1f64.p0f64(double*)
+declare { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3.v1f64.p0f64(double*)
 
 
 define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld4(i8* %A, i8** %ptr) {
 ;CHECK-LABEL: test_v16i8_post_imm_ld4:
 ;CHECK: ld4.16b { v0, v1, v2, v3 }, [x0], #64
-  %ld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0i8(i8* %A)
+  %ld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i32 64
   store i8* %tmp, i8** %ptr
   ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4
@@ -1128,19 +1128,19 @@
 define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld4(i8* %A, i8** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v16i8_post_reg_ld4:
 ;CHECK: ld4.16b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0i8(i8* %A)
+  %ld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   store i8* %tmp, i8** %ptr
   ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4
 }
 
-declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0i8(i8*)
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0i8(i8*)
 
 
 define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld4(i8* %A, i8** %ptr) {
 ;CHECK-LABEL: test_v8i8_post_imm_ld4:
 ;CHECK: ld4.8b { v0, v1, v2, v3 }, [x0], #32
-  %ld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4.v8i8.p0i8(i8* %A)
+  %ld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i32 32
   store i8* %tmp, i8** %ptr
   ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4
@@ -1149,19 +1149,19 @@
 define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld4(i8* %A, i8** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v8i8_post_reg_ld4:
 ;CHECK: ld4.8b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4.v8i8.p0i8(i8* %A)
+  %ld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   store i8* %tmp, i8** %ptr
   ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4
 }
 
-declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4.v8i8.p0i8(i8*)
+declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0i8(i8*)
 
 
 define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld4(i16* %A, i16** %ptr) {
 ;CHECK-LABEL: test_v8i16_post_imm_ld4:
 ;CHECK: ld4.8h { v0, v1, v2, v3 }, [x0], #64
-  %ld4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4.v8i16.p0i16(i16* %A)
+  %ld4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i32 32
   store i16* %tmp, i16** %ptr
   ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4
@@ -1170,19 +1170,19 @@
 define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld4(i16* %A, i16** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v8i16_post_reg_ld4:
 ;CHECK: ld4.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4.v8i16.p0i16(i16* %A)
+  %ld4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   store i16* %tmp, i16** %ptr
   ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4
 }
 
-declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4.v8i16.p0i16(i16*)
+declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0i16(i16*)
 
 
 define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld4(i16* %A, i16** %ptr) {
 ;CHECK-LABEL: test_v4i16_post_imm_ld4:
 ;CHECK: ld4.4h { v0, v1, v2, v3 }, [x0], #32
-  %ld4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4.v4i16.p0i16(i16* %A)
+  %ld4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i32 16
   store i16* %tmp, i16** %ptr
   ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4
@@ -1191,19 +1191,19 @@
 define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld4(i16* %A, i16** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v4i16_post_reg_ld4:
 ;CHECK: ld4.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4.v4i16.p0i16(i16* %A)
+  %ld4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   store i16* %tmp, i16** %ptr
   ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4
 }
 
-declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4.v4i16.p0i16(i16*)
+declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0i16(i16*)
 
 
 define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld4(i32* %A, i32** %ptr) {
 ;CHECK-LABEL: test_v4i32_post_imm_ld4:
 ;CHECK: ld4.4s { v0, v1, v2, v3 }, [x0], #64
-  %ld4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4.v4i32.p0i32(i32* %A)
+  %ld4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i32 16
   store i32* %tmp, i32** %ptr
   ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4
@@ -1212,19 +1212,19 @@
 define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld4(i32* %A, i32** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v4i32_post_reg_ld4:
 ;CHECK: ld4.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4.v4i32.p0i32(i32* %A)
+  %ld4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   store i32* %tmp, i32** %ptr
   ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4
 }
 
-declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4.v4i32.p0i32(i32*)
+declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0i32(i32*)
 
 
 define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld4(i32* %A, i32** %ptr) {
 ;CHECK-LABEL: test_v2i32_post_imm_ld4:
 ;CHECK: ld4.2s { v0, v1, v2, v3 }, [x0], #32
-  %ld4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4.v2i32.p0i32(i32* %A)
+  %ld4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i32 8
   store i32* %tmp, i32** %ptr
   ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4
@@ -1233,19 +1233,19 @@
 define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld4(i32* %A, i32** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v2i32_post_reg_ld4:
 ;CHECK: ld4.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4.v2i32.p0i32(i32* %A)
+  %ld4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   store i32* %tmp, i32** %ptr
   ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4
 }
 
-declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4.v2i32.p0i32(i32*)
+declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0i32(i32*)
 
 
 define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld4(i64* %A, i64** %ptr) {
 ;CHECK-LABEL: test_v2i64_post_imm_ld4:
 ;CHECK: ld4.2d { v0, v1, v2, v3 }, [x0], #64
-  %ld4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4.v2i64.p0i64(i64* %A)
+  %ld4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i32 8
   store i64* %tmp, i64** %ptr
   ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4
@@ -1254,19 +1254,19 @@
 define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld4(i64* %A, i64** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v2i64_post_reg_ld4:
 ;CHECK: ld4.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4.v2i64.p0i64(i64* %A)
+  %ld4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   store i64* %tmp, i64** %ptr
   ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4
 }
 
-declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4.v2i64.p0i64(i64*)
+declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0i64(i64*)
 
 
 define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld4(i64* %A, i64** %ptr) {
 ;CHECK-LABEL: test_v1i64_post_imm_ld4:
 ;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], #32
-  %ld4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4.v1i64.p0i64(i64* %A)
+  %ld4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i32 4
   store i64* %tmp, i64** %ptr
   ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4
@@ -1275,19 +1275,19 @@
 define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld4(i64* %A, i64** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v1i64_post_reg_ld4:
 ;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4.v1i64.p0i64(i64* %A)
+  %ld4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   store i64* %tmp, i64** %ptr
   ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4
 }
 
-declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4.v1i64.p0i64(i64*)
+declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0i64(i64*)
 
 
 define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld4(float* %A, float** %ptr) {
 ;CHECK-LABEL: test_v4f32_post_imm_ld4:
 ;CHECK: ld4.4s { v0, v1, v2, v3 }, [x0], #64
-  %ld4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4.v4f32.p0f32(float* %A)
+  %ld4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i32 16
   store float* %tmp, float** %ptr
   ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4
@@ -1296,19 +1296,19 @@
 define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld4(float* %A, float** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v4f32_post_reg_ld4:
 ;CHECK: ld4.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4.v4f32.p0f32(float* %A)
+  %ld4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   store float* %tmp, float** %ptr
   ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4
 }
 
-declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4.v4f32.p0f32(float*)
+declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0f32(float*)
 
 
 define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld4(float* %A, float** %ptr) {
 ;CHECK-LABEL: test_v2f32_post_imm_ld4:
 ;CHECK: ld4.2s { v0, v1, v2, v3 }, [x0], #32
-  %ld4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4.v2f32.p0f32(float* %A)
+  %ld4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4.v2f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i32 8
   store float* %tmp, float** %ptr
   ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4
@@ -1317,19 +1317,19 @@
 define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld4(float* %A, float** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v2f32_post_reg_ld4:
 ;CHECK: ld4.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4.v2f32.p0f32(float* %A)
+  %ld4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4.v2f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   store float* %tmp, float** %ptr
   ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4
 }
 
-declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4.v2f32.p0f32(float*)
+declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4.v2f32.p0f32(float*)
 
 
 define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld4(double* %A, double** %ptr) {
 ;CHECK-LABEL: test_v2f64_post_imm_ld4:
 ;CHECK: ld4.2d { v0, v1, v2, v3 }, [x0], #64
-  %ld4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4.v2f64.p0f64(double* %A)
+  %ld4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4.v2f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i32 8
   store double* %tmp, double** %ptr
   ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4
@@ -1338,19 +1338,19 @@
 define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld4(double* %A, double** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v2f64_post_reg_ld4:
 ;CHECK: ld4.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4.v2f64.p0f64(double* %A)
+  %ld4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4.v2f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   store double* %tmp, double** %ptr
   ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4
 }
 
-declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4.v2f64.p0f64(double*)
+declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4.v2f64.p0f64(double*)
 
 
 define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld4(double* %A, double** %ptr) {
 ;CHECK-LABEL: test_v1f64_post_imm_ld4:
 ;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], #32
-  %ld4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4.v1f64.p0f64(double* %A)
+  %ld4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4.v1f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i32 4
   store double* %tmp, double** %ptr
   ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4
@@ -1359,18 +1359,18 @@
 define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld4(double* %A, double** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v1f64_post_reg_ld4:
 ;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4.v1f64.p0f64(double* %A)
+  %ld4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4.v1f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   store double* %tmp, double** %ptr
   ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4
 }
 
-declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4.v1f64.p0f64(double*)
+declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4.v1f64.p0f64(double*)
 
 define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld1x2(i8* %A, i8** %ptr) {
 ;CHECK-LABEL: test_v16i8_post_imm_ld1x2:
 ;CHECK: ld1.16b { v0, v1 }, [x0], #32
-  %ld1x2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x2.v16i8.p0i8(i8* %A)
+  %ld1x2 = tail call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i32 32
   store i8* %tmp, i8** %ptr
   ret { <16 x i8>, <16 x i8> } %ld1x2
@@ -1379,19 +1379,19 @@
 define { <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld1x2(i8* %A, i8** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v16i8_post_reg_ld1x2:
 ;CHECK: ld1.16b { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld1x2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x2.v16i8.p0i8(i8* %A)
+  %ld1x2 = tail call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   store i8* %tmp, i8** %ptr
   ret { <16 x i8>, <16 x i8> } %ld1x2
 }
 
-declare { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x2.v16i8.p0i8(i8*)
+declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8*)
 
 
 define { <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld1x2(i8* %A, i8** %ptr) {
 ;CHECK-LABEL: test_v8i8_post_imm_ld1x2:
 ;CHECK: ld1.8b { v0, v1 }, [x0], #16
-  %ld1x2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x2.v8i8.p0i8(i8* %A)
+  %ld1x2 = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i32 16
   store i8* %tmp, i8** %ptr
   ret { <8 x i8>, <8 x i8> } %ld1x2
@@ -1400,19 +1400,19 @@
 define { <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld1x2(i8* %A, i8** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v8i8_post_reg_ld1x2:
 ;CHECK: ld1.8b { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld1x2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x2.v8i8.p0i8(i8* %A)
+  %ld1x2 = tail call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   store i8* %tmp, i8** %ptr
   ret { <8 x i8>, <8 x i8> } %ld1x2
 }
 
-declare { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x2.v8i8.p0i8(i8*)
+declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8*)
 
 
 define { <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld1x2(i16* %A, i16** %ptr) {
 ;CHECK-LABEL: test_v8i16_post_imm_ld1x2:
 ;CHECK: ld1.8h { v0, v1 }, [x0], #32
-  %ld1x2 = tail call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x2.v8i16.p0i16(i16* %A)
+  %ld1x2 = tail call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i32 16
   store i16* %tmp, i16** %ptr
   ret { <8 x i16>, <8 x i16> } %ld1x2
@@ -1421,19 +1421,19 @@
 define { <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld1x2(i16* %A, i16** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v8i16_post_reg_ld1x2:
 ;CHECK: ld1.8h { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld1x2 = tail call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x2.v8i16.p0i16(i16* %A)
+  %ld1x2 = tail call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   store i16* %tmp, i16** %ptr
   ret { <8 x i16>, <8 x i16> } %ld1x2
 }
 
-declare { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x2.v8i16.p0i16(i16*)
+declare { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16*)
 
 
 define { <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld1x2(i16* %A, i16** %ptr) {
 ;CHECK-LABEL: test_v4i16_post_imm_ld1x2:
 ;CHECK: ld1.4h { v0, v1 }, [x0], #16
-  %ld1x2 = tail call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x2.v4i16.p0i16(i16* %A)
+  %ld1x2 = tail call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i32 8
   store i16* %tmp, i16** %ptr
   ret { <4 x i16>, <4 x i16> } %ld1x2
@@ -1442,19 +1442,19 @@
 define { <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld1x2(i16* %A, i16** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v4i16_post_reg_ld1x2:
 ;CHECK: ld1.4h { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld1x2 = tail call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x2.v4i16.p0i16(i16* %A)
+  %ld1x2 = tail call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   store i16* %tmp, i16** %ptr
   ret { <4 x i16>, <4 x i16> } %ld1x2
 }
 
-declare { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x2.v4i16.p0i16(i16*)
+declare { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16*)
 
 
 define { <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld1x2(i32* %A, i32** %ptr) {
 ;CHECK-LABEL: test_v4i32_post_imm_ld1x2:
 ;CHECK: ld1.4s { v0, v1 }, [x0], #32
-  %ld1x2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x2.v4i32.p0i32(i32* %A)
+  %ld1x2 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i32 8
   store i32* %tmp, i32** %ptr
   ret { <4 x i32>, <4 x i32> } %ld1x2
@@ -1463,19 +1463,19 @@
 define { <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld1x2(i32* %A, i32** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v4i32_post_reg_ld1x2:
 ;CHECK: ld1.4s { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld1x2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x2.v4i32.p0i32(i32* %A)
+  %ld1x2 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   store i32* %tmp, i32** %ptr
   ret { <4 x i32>, <4 x i32> } %ld1x2
 }
 
-declare { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x2.v4i32.p0i32(i32*)
+declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32*)
 
 
 define { <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld1x2(i32* %A, i32** %ptr) {
 ;CHECK-LABEL: test_v2i32_post_imm_ld1x2:
 ;CHECK: ld1.2s { v0, v1 }, [x0], #16
-  %ld1x2 = tail call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x2.v2i32.p0i32(i32* %A)
+  %ld1x2 = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i32 4
   store i32* %tmp, i32** %ptr
   ret { <2 x i32>, <2 x i32> } %ld1x2
@@ -1484,19 +1484,19 @@
 define { <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld1x2(i32* %A, i32** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v2i32_post_reg_ld1x2:
 ;CHECK: ld1.2s { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld1x2 = tail call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x2.v2i32.p0i32(i32* %A)
+  %ld1x2 = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   store i32* %tmp, i32** %ptr
   ret { <2 x i32>, <2 x i32> } %ld1x2
 }
 
-declare { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x2.v2i32.p0i32(i32*)
+declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32*)
 
 
 define { <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld1x2(i64* %A, i64** %ptr) {
 ;CHECK-LABEL: test_v2i64_post_imm_ld1x2:
 ;CHECK: ld1.2d { v0, v1 }, [x0], #32
-  %ld1x2 = tail call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x2.v2i64.p0i64(i64* %A)
+  %ld1x2 = tail call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i32 4
   store i64* %tmp, i64** %ptr
   ret { <2 x i64>, <2 x i64> } %ld1x2
@@ -1505,19 +1505,19 @@
 define { <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld1x2(i64* %A, i64** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v2i64_post_reg_ld1x2:
 ;CHECK: ld1.2d { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld1x2 = tail call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x2.v2i64.p0i64(i64* %A)
+  %ld1x2 = tail call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   store i64* %tmp, i64** %ptr
   ret { <2 x i64>, <2 x i64> } %ld1x2
 }
 
-declare { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x2.v2i64.p0i64(i64*)
+declare { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64*)
 
 
 define { <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld1x2(i64* %A, i64** %ptr) {
 ;CHECK-LABEL: test_v1i64_post_imm_ld1x2:
 ;CHECK: ld1.1d { v0, v1 }, [x0], #16
-  %ld1x2 = tail call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x2.v1i64.p0i64(i64* %A)
+  %ld1x2 = tail call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i32 2
   store i64* %tmp, i64** %ptr
   ret { <1 x i64>, <1 x i64> } %ld1x2
@@ -1526,19 +1526,19 @@
 define { <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld1x2(i64* %A, i64** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v1i64_post_reg_ld1x2:
 ;CHECK: ld1.1d { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld1x2 = tail call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x2.v1i64.p0i64(i64* %A)
+  %ld1x2 = tail call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   store i64* %tmp, i64** %ptr
   ret { <1 x i64>, <1 x i64> } %ld1x2
 }
 
-declare { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x2.v1i64.p0i64(i64*)
+declare { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64*)
 
 
 define { <4 x float>, <4 x float> } @test_v4f32_post_imm_ld1x2(float* %A, float** %ptr) {
 ;CHECK-LABEL: test_v4f32_post_imm_ld1x2:
 ;CHECK: ld1.4s { v0, v1 }, [x0], #32
-  %ld1x2 = tail call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x2.v4f32.p0f32(float* %A)
+  %ld1x2 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x2.v4f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i32 8
   store float* %tmp, float** %ptr
   ret { <4 x float>, <4 x float> } %ld1x2
@@ -1547,19 +1547,19 @@
 define { <4 x float>, <4 x float> } @test_v4f32_post_reg_ld1x2(float* %A, float** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v4f32_post_reg_ld1x2:
 ;CHECK: ld1.4s { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld1x2 = tail call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x2.v4f32.p0f32(float* %A)
+  %ld1x2 = tail call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x2.v4f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   store float* %tmp, float** %ptr
   ret { <4 x float>, <4 x float> } %ld1x2
 }
 
-declare { <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x2.v4f32.p0f32(float*)
+declare { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x2.v4f32.p0f32(float*)
 
 
 define { <2 x float>, <2 x float> } @test_v2f32_post_imm_ld1x2(float* %A, float** %ptr) {
 ;CHECK-LABEL: test_v2f32_post_imm_ld1x2:
 ;CHECK: ld1.2s { v0, v1 }, [x0], #16
-  %ld1x2 = tail call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x2.v2f32.p0f32(float* %A)
+  %ld1x2 = tail call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i32 4
   store float* %tmp, float** %ptr
   ret { <2 x float>, <2 x float> } %ld1x2
@@ -1568,19 +1568,19 @@
 define { <2 x float>, <2 x float> } @test_v2f32_post_reg_ld1x2(float* %A, float** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v2f32_post_reg_ld1x2:
 ;CHECK: ld1.2s { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld1x2 = tail call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x2.v2f32.p0f32(float* %A)
+  %ld1x2 = tail call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   store float* %tmp, float** %ptr
   ret { <2 x float>, <2 x float> } %ld1x2
 }
 
-declare { <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x2.v2f32.p0f32(float*)
+declare { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float*)
 
 
 define { <2 x double>, <2 x double> } @test_v2f64_post_imm_ld1x2(double* %A, double** %ptr) {
 ;CHECK-LABEL: test_v2f64_post_imm_ld1x2:
 ;CHECK: ld1.2d { v0, v1 }, [x0], #32
-  %ld1x2 = tail call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x2.v2f64.p0f64(double* %A)
+  %ld1x2 = tail call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x2.v2f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i32 4
   store double* %tmp, double** %ptr
   ret { <2 x double>, <2 x double> } %ld1x2
@@ -1589,19 +1589,19 @@
 define { <2 x double>, <2 x double> } @test_v2f64_post_reg_ld1x2(double* %A, double** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v2f64_post_reg_ld1x2:
 ;CHECK: ld1.2d { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld1x2 = tail call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x2.v2f64.p0f64(double* %A)
+  %ld1x2 = tail call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x2.v2f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   store double* %tmp, double** %ptr
   ret { <2 x double>, <2 x double> } %ld1x2
 }
 
-declare { <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x2.v2f64.p0f64(double*)
+declare { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x2.v2f64.p0f64(double*)
 
 
 define { <1 x double>, <1 x double> } @test_v1f64_post_imm_ld1x2(double* %A, double** %ptr) {
 ;CHECK-LABEL: test_v1f64_post_imm_ld1x2:
 ;CHECK: ld1.1d { v0, v1 }, [x0], #16
-  %ld1x2 = tail call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x2.v1f64.p0f64(double* %A)
+  %ld1x2 = tail call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x2.v1f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i32 2
   store double* %tmp, double** %ptr
   ret { <1 x double>, <1 x double> } %ld1x2
@@ -1610,19 +1610,19 @@
 define { <1 x double>, <1 x double> } @test_v1f64_post_reg_ld1x2(double* %A, double** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v1f64_post_reg_ld1x2:
 ;CHECK: ld1.1d { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld1x2 = tail call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x2.v1f64.p0f64(double* %A)
+  %ld1x2 = tail call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x2.v1f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   store double* %tmp, double** %ptr
   ret { <1 x double>, <1 x double> } %ld1x2
 }
 
-declare { <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x2.v1f64.p0f64(double*)
+declare { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x2.v1f64.p0f64(double*)
 
 
 define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld1x3(i8* %A, i8** %ptr) {
 ;CHECK-LABEL: test_v16i8_post_imm_ld1x3:
 ;CHECK: ld1.16b { v0, v1, v2 }, [x0], #48
-  %ld1x3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x3.v16i8.p0i8(i8* %A)
+  %ld1x3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i32 48
   store i8* %tmp, i8** %ptr
   ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld1x3
@@ -1631,19 +1631,19 @@
 define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld1x3(i8* %A, i8** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v16i8_post_reg_ld1x3:
 ;CHECK: ld1.16b { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld1x3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x3.v16i8.p0i8(i8* %A)
+  %ld1x3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   store i8* %tmp, i8** %ptr
   ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld1x3
 }
 
-declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x3.v16i8.p0i8(i8*)
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8*)
 
 
 define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld1x3(i8* %A, i8** %ptr) {
 ;CHECK-LABEL: test_v8i8_post_imm_ld1x3:
 ;CHECK: ld1.8b { v0, v1, v2 }, [x0], #24
-  %ld1x3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x3.v8i8.p0i8(i8* %A)
+  %ld1x3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i32 24
   store i8* %tmp, i8** %ptr
   ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld1x3
@@ -1652,19 +1652,19 @@
 define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld1x3(i8* %A, i8** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v8i8_post_reg_ld1x3:
 ;CHECK: ld1.8b { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld1x3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x3.v8i8.p0i8(i8* %A)
+  %ld1x3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   store i8* %tmp, i8** %ptr
   ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld1x3
 }
 
-declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x3.v8i8.p0i8(i8*)
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8*)
 
 
 define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld1x3(i16* %A, i16** %ptr) {
 ;CHECK-LABEL: test_v8i16_post_imm_ld1x3:
 ;CHECK: ld1.8h { v0, v1, v2 }, [x0], #48
-  %ld1x3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x3.v8i16.p0i16(i16* %A)
+  %ld1x3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i32 24
   store i16* %tmp, i16** %ptr
   ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld1x3
@@ -1673,19 +1673,19 @@
 define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld1x3(i16* %A, i16** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v8i16_post_reg_ld1x3:
 ;CHECK: ld1.8h { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld1x3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x3.v8i16.p0i16(i16* %A)
+  %ld1x3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   store i16* %tmp, i16** %ptr
   ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld1x3
 }
 
-declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x3.v8i16.p0i16(i16*)
+declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16*)
 
 
 define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld1x3(i16* %A, i16** %ptr) {
 ;CHECK-LABEL: test_v4i16_post_imm_ld1x3:
 ;CHECK: ld1.4h { v0, v1, v2 }, [x0], #24
-  %ld1x3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x3.v4i16.p0i16(i16* %A)
+  %ld1x3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i32 12
   store i16* %tmp, i16** %ptr
   ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld1x3
@@ -1694,19 +1694,19 @@
 define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld1x3(i16* %A, i16** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v4i16_post_reg_ld1x3:
 ;CHECK: ld1.4h { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld1x3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x3.v4i16.p0i16(i16* %A)
+  %ld1x3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   store i16* %tmp, i16** %ptr
   ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld1x3
 }
 
-declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x3.v4i16.p0i16(i16*)
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16*)
 
 
 define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld1x3(i32* %A, i32** %ptr) {
 ;CHECK-LABEL: test_v4i32_post_imm_ld1x3:
 ;CHECK: ld1.4s { v0, v1, v2 }, [x0], #48
-  %ld1x3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x3.v4i32.p0i32(i32* %A)
+  %ld1x3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i32 12
   store i32* %tmp, i32** %ptr
   ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld1x3
@@ -1715,19 +1715,19 @@
 define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld1x3(i32* %A, i32** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v4i32_post_reg_ld1x3:
 ;CHECK: ld1.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld1x3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x3.v4i32.p0i32(i32* %A)
+  %ld1x3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   store i32* %tmp, i32** %ptr
   ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld1x3
 }
 
-declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x3.v4i32.p0i32(i32*)
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32*)
 
 
 define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld1x3(i32* %A, i32** %ptr) {
 ;CHECK-LABEL: test_v2i32_post_imm_ld1x3:
 ;CHECK: ld1.2s { v0, v1, v2 }, [x0], #24
-  %ld1x3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x3.v2i32.p0i32(i32* %A)
+  %ld1x3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i32 6
   store i32* %tmp, i32** %ptr
   ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld1x3
@@ -1736,19 +1736,19 @@
 define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld1x3(i32* %A, i32** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v2i32_post_reg_ld1x3:
 ;CHECK: ld1.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld1x3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x3.v2i32.p0i32(i32* %A)
+  %ld1x3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   store i32* %tmp, i32** %ptr
   ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld1x3
 }
 
-declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x3.v2i32.p0i32(i32*)
+declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32*)
 
 
 define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld1x3(i64* %A, i64** %ptr) {
 ;CHECK-LABEL: test_v2i64_post_imm_ld1x3:
 ;CHECK: ld1.2d { v0, v1, v2 }, [x0], #48
-  %ld1x3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x3.v2i64.p0i64(i64* %A)
+  %ld1x3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i32 6
   store i64* %tmp, i64** %ptr
   ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld1x3
@@ -1757,19 +1757,19 @@
 define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld1x3(i64* %A, i64** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v2i64_post_reg_ld1x3:
 ;CHECK: ld1.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld1x3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x3.v2i64.p0i64(i64* %A)
+  %ld1x3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   store i64* %tmp, i64** %ptr
   ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld1x3
 }
 
-declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x3.v2i64.p0i64(i64*)
+declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64*)
 
 
 define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld1x3(i64* %A, i64** %ptr) {
 ;CHECK-LABEL: test_v1i64_post_imm_ld1x3:
 ;CHECK: ld1.1d { v0, v1, v2 }, [x0], #24
-  %ld1x3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x3.v1i64.p0i64(i64* %A)
+  %ld1x3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i32 3
   store i64* %tmp, i64** %ptr
   ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld1x3
@@ -1778,19 +1778,19 @@
 define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld1x3(i64* %A, i64** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v1i64_post_reg_ld1x3:
 ;CHECK: ld1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld1x3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x3.v1i64.p0i64(i64* %A)
+  %ld1x3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   store i64* %tmp, i64** %ptr
   ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld1x3
 }
 
-declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x3.v1i64.p0i64(i64*)
+declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64*)
 
 
 define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld1x3(float* %A, float** %ptr) {
 ;CHECK-LABEL: test_v4f32_post_imm_ld1x3:
 ;CHECK: ld1.4s { v0, v1, v2 }, [x0], #48
-  %ld1x3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x3.v4f32.p0f32(float* %A)
+  %ld1x3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x3.v4f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i32 12
   store float* %tmp, float** %ptr
   ret { <4 x float>, <4 x float>, <4 x float> } %ld1x3
@@ -1799,19 +1799,19 @@
 define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld1x3(float* %A, float** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v4f32_post_reg_ld1x3:
 ;CHECK: ld1.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld1x3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x3.v4f32.p0f32(float* %A)
+  %ld1x3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x3.v4f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   store float* %tmp, float** %ptr
   ret { <4 x float>, <4 x float>, <4 x float> } %ld1x3
 }
 
-declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x3.v4f32.p0f32(float*)
+declare { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x3.v4f32.p0f32(float*)
 
 
 define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld1x3(float* %A, float** %ptr) {
 ;CHECK-LABEL: test_v2f32_post_imm_ld1x3:
 ;CHECK: ld1.2s { v0, v1, v2 }, [x0], #24
-  %ld1x3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x3.v2f32.p0f32(float* %A)
+  %ld1x3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i32 6
   store float* %tmp, float** %ptr
   ret { <2 x float>, <2 x float>, <2 x float> } %ld1x3
@@ -1820,19 +1820,19 @@
 define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld1x3(float* %A, float** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v2f32_post_reg_ld1x3:
 ;CHECK: ld1.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld1x3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x3.v2f32.p0f32(float* %A)
+  %ld1x3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   store float* %tmp, float** %ptr
   ret { <2 x float>, <2 x float>, <2 x float> } %ld1x3
 }
 
-declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x3.v2f32.p0f32(float*)
+declare { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float*)
 
 
 define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld1x3(double* %A, double** %ptr) {
 ;CHECK-LABEL: test_v2f64_post_imm_ld1x3:
 ;CHECK: ld1.2d { v0, v1, v2 }, [x0], #48
-  %ld1x3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x3.v2f64.p0f64(double* %A)
+  %ld1x3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x3.v2f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i32 6
   store double* %tmp, double** %ptr
   ret { <2 x double>, <2 x double>, <2 x double> } %ld1x3
@@ -1841,19 +1841,19 @@
 define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld1x3(double* %A, double** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v2f64_post_reg_ld1x3:
 ;CHECK: ld1.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld1x3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x3.v2f64.p0f64(double* %A)
+  %ld1x3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x3.v2f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   store double* %tmp, double** %ptr
   ret { <2 x double>, <2 x double>, <2 x double> } %ld1x3
 }
 
-declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x3.v2f64.p0f64(double*)
+declare { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x3.v2f64.p0f64(double*)
 
 
 define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld1x3(double* %A, double** %ptr) {
 ;CHECK-LABEL: test_v1f64_post_imm_ld1x3:
 ;CHECK: ld1.1d { v0, v1, v2 }, [x0], #24
-  %ld1x3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x3.v1f64.p0f64(double* %A)
+  %ld1x3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x3.v1f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i32 3
   store double* %tmp, double** %ptr
   ret { <1 x double>, <1 x double>, <1 x double> } %ld1x3
@@ -1862,19 +1862,19 @@
 define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld1x3(double* %A, double** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v1f64_post_reg_ld1x3:
 ;CHECK: ld1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld1x3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x3.v1f64.p0f64(double* %A)
+  %ld1x3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x3.v1f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   store double* %tmp, double** %ptr
   ret { <1 x double>, <1 x double>, <1 x double> } %ld1x3
 }
 
-declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x3.v1f64.p0f64(double*)
+declare { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x3.v1f64.p0f64(double*)
 
 
 define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld1x4(i8* %A, i8** %ptr) {
 ;CHECK-LABEL: test_v16i8_post_imm_ld1x4:
 ;CHECK: ld1.16b { v0, v1, v2, v3 }, [x0], #64
-  %ld1x4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x4.v16i8.p0i8(i8* %A)
+  %ld1x4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i32 64
   store i8* %tmp, i8** %ptr
   ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld1x4
@@ -1883,19 +1883,19 @@
 define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld1x4(i8* %A, i8** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v16i8_post_reg_ld1x4:
 ;CHECK: ld1.16b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld1x4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x4.v16i8.p0i8(i8* %A)
+  %ld1x4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   store i8* %tmp, i8** %ptr
   ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld1x4
 }
 
-declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld1x4.v16i8.p0i8(i8*)
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8*)
 
 
 define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld1x4(i8* %A, i8** %ptr) {
 ;CHECK-LABEL: test_v8i8_post_imm_ld1x4:
 ;CHECK: ld1.8b { v0, v1, v2, v3 }, [x0], #32
-  %ld1x4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x4.v8i8.p0i8(i8* %A)
+  %ld1x4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i32 32
   store i8* %tmp, i8** %ptr
   ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld1x4
@@ -1904,19 +1904,19 @@
 define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld1x4(i8* %A, i8** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v8i8_post_reg_ld1x4:
 ;CHECK: ld1.8b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld1x4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x4.v8i8.p0i8(i8* %A)
+  %ld1x4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   store i8* %tmp, i8** %ptr
   ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld1x4
 }
 
-declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld1x4.v8i8.p0i8(i8*)
+declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8*)
 
 
 define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld1x4(i16* %A, i16** %ptr) {
 ;CHECK-LABEL: test_v8i16_post_imm_ld1x4:
 ;CHECK: ld1.8h { v0, v1, v2, v3 }, [x0], #64
-  %ld1x4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x4.v8i16.p0i16(i16* %A)
+  %ld1x4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i32 32
   store i16* %tmp, i16** %ptr
   ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld1x4
@@ -1925,19 +1925,19 @@
 define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld1x4(i16* %A, i16** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v8i16_post_reg_ld1x4:
 ;CHECK: ld1.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld1x4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x4.v8i16.p0i16(i16* %A)
+  %ld1x4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   store i16* %tmp, i16** %ptr
   ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld1x4
 }
 
-declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld1x4.v8i16.p0i16(i16*)
+declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16*)
 
 
 define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld1x4(i16* %A, i16** %ptr) {
 ;CHECK-LABEL: test_v4i16_post_imm_ld1x4:
 ;CHECK: ld1.4h { v0, v1, v2, v3 }, [x0], #32
-  %ld1x4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x4.v4i16.p0i16(i16* %A)
+  %ld1x4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i32 16
   store i16* %tmp, i16** %ptr
   ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld1x4
@@ -1946,19 +1946,19 @@
 define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld1x4(i16* %A, i16** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v4i16_post_reg_ld1x4:
 ;CHECK: ld1.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld1x4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x4.v4i16.p0i16(i16* %A)
+  %ld1x4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   store i16* %tmp, i16** %ptr
   ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld1x4
 }
 
-declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld1x4.v4i16.p0i16(i16*)
+declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16*)
 
 
 define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld1x4(i32* %A, i32** %ptr) {
 ;CHECK-LABEL: test_v4i32_post_imm_ld1x4:
 ;CHECK: ld1.4s { v0, v1, v2, v3 }, [x0], #64
-  %ld1x4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x4.v4i32.p0i32(i32* %A)
+  %ld1x4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i32 16
   store i32* %tmp, i32** %ptr
   ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld1x4
@@ -1967,19 +1967,19 @@
 define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld1x4(i32* %A, i32** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v4i32_post_reg_ld1x4:
 ;CHECK: ld1.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld1x4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x4.v4i32.p0i32(i32* %A)
+  %ld1x4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   store i32* %tmp, i32** %ptr
   ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld1x4
 }
 
-declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld1x4.v4i32.p0i32(i32*)
+declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32*)
 
 
 define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld1x4(i32* %A, i32** %ptr) {
 ;CHECK-LABEL: test_v2i32_post_imm_ld1x4:
 ;CHECK: ld1.2s { v0, v1, v2, v3 }, [x0], #32
-  %ld1x4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x4.v2i32.p0i32(i32* %A)
+  %ld1x4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i32 8
   store i32* %tmp, i32** %ptr
   ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld1x4
@@ -1988,19 +1988,19 @@
 define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld1x4(i32* %A, i32** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v2i32_post_reg_ld1x4:
 ;CHECK: ld1.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld1x4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x4.v2i32.p0i32(i32* %A)
+  %ld1x4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   store i32* %tmp, i32** %ptr
   ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld1x4
 }
 
-declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld1x4.v2i32.p0i32(i32*)
+declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32*)
 
 
 define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld1x4(i64* %A, i64** %ptr) {
 ;CHECK-LABEL: test_v2i64_post_imm_ld1x4:
 ;CHECK: ld1.2d { v0, v1, v2, v3 }, [x0], #64
-  %ld1x4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x4.v2i64.p0i64(i64* %A)
+  %ld1x4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i32 8
   store i64* %tmp, i64** %ptr
   ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld1x4
@@ -2009,19 +2009,19 @@
 define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld1x4(i64* %A, i64** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v2i64_post_reg_ld1x4:
 ;CHECK: ld1.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld1x4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x4.v2i64.p0i64(i64* %A)
+  %ld1x4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   store i64* %tmp, i64** %ptr
   ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld1x4
 }
 
-declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld1x4.v2i64.p0i64(i64*)
+declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64*)
 
 
 define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld1x4(i64* %A, i64** %ptr) {
 ;CHECK-LABEL: test_v1i64_post_imm_ld1x4:
 ;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], #32
-  %ld1x4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x4.v1i64.p0i64(i64* %A)
+  %ld1x4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i32 4
   store i64* %tmp, i64** %ptr
   ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld1x4
@@ -2030,19 +2030,19 @@
 define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld1x4(i64* %A, i64** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v1i64_post_reg_ld1x4:
 ;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld1x4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x4.v1i64.p0i64(i64* %A)
+  %ld1x4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   store i64* %tmp, i64** %ptr
   ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld1x4
 }
 
-declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld1x4.v1i64.p0i64(i64*)
+declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64*)
 
 
 define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld1x4(float* %A, float** %ptr) {
 ;CHECK-LABEL: test_v4f32_post_imm_ld1x4:
 ;CHECK: ld1.4s { v0, v1, v2, v3 }, [x0], #64
-  %ld1x4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x4.v4f32.p0f32(float* %A)
+  %ld1x4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x4.v4f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i32 16
   store float* %tmp, float** %ptr
   ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld1x4
@@ -2051,19 +2051,19 @@
 define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld1x4(float* %A, float** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v4f32_post_reg_ld1x4:
 ;CHECK: ld1.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld1x4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x4.v4f32.p0f32(float* %A)
+  %ld1x4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x4.v4f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   store float* %tmp, float** %ptr
   ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld1x4
 }
 
-declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld1x4.v4f32.p0f32(float*)
+declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x4.v4f32.p0f32(float*)
 
 
 define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld1x4(float* %A, float** %ptr) {
 ;CHECK-LABEL: test_v2f32_post_imm_ld1x4:
 ;CHECK: ld1.2s { v0, v1, v2, v3 }, [x0], #32
-  %ld1x4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x4.v2f32.p0f32(float* %A)
+  %ld1x4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i32 8
   store float* %tmp, float** %ptr
   ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld1x4
@@ -2072,19 +2072,19 @@
 define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld1x4(float* %A, float** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v2f32_post_reg_ld1x4:
 ;CHECK: ld1.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld1x4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x4.v2f32.p0f32(float* %A)
+  %ld1x4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   store float* %tmp, float** %ptr
   ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld1x4
 }
 
-declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld1x4.v2f32.p0f32(float*)
+declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float*)
 
 
 define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld1x4(double* %A, double** %ptr) {
 ;CHECK-LABEL: test_v2f64_post_imm_ld1x4:
 ;CHECK: ld1.2d { v0, v1, v2, v3 }, [x0], #64
-  %ld1x4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x4.v2f64.p0f64(double* %A)
+  %ld1x4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x4.v2f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i32 8
   store double* %tmp, double** %ptr
   ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld1x4
@@ -2093,19 +2093,19 @@
 define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld1x4(double* %A, double** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v2f64_post_reg_ld1x4:
 ;CHECK: ld1.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld1x4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x4.v2f64.p0f64(double* %A)
+  %ld1x4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x4.v2f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   store double* %tmp, double** %ptr
   ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld1x4
 }
 
-declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld1x4.v2f64.p0f64(double*)
+declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x4.v2f64.p0f64(double*)
 
 
 define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld1x4(double* %A, double** %ptr) {
 ;CHECK-LABEL: test_v1f64_post_imm_ld1x4:
 ;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], #32
-  %ld1x4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x4.v1f64.p0f64(double* %A)
+  %ld1x4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x4.v1f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i32 4
   store double* %tmp, double** %ptr
   ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld1x4
@@ -2114,19 +2114,19 @@
 define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld1x4(double* %A, double** %ptr, i64 %inc) {
 ;CHECK-LABEL: test_v1f64_post_reg_ld1x4:
 ;CHECK: ld1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld1x4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x4.v1f64.p0f64(double* %A)
+  %ld1x4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x4.v1f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   store double* %tmp, double** %ptr
   ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld1x4
 }
 
-declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld1x4.v1f64.p0f64(double*)
+declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x4.v1f64.p0f64(double*)
 
 
 define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld2r(i8* %A, i8** %ptr) nounwind {
 ;CHECK-LABEL: test_v16i8_post_imm_ld2r:
 ;CHECK: ld2r.16b { v0, v1 }, [x0], #2
-  %ld2 = call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2r.v16i8.p0i8(i8* %A)
+  %ld2 = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i32 2
   store i8* %tmp, i8** %ptr
   ret { <16 x i8>, <16 x i8> } %ld2
@@ -2135,19 +2135,19 @@
 define { <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld2r(i8* %A, i8** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v16i8_post_reg_ld2r:
 ;CHECK: ld2r.16b { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld2 = call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2r.v16i8.p0i8(i8* %A)
+  %ld2 = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   store i8* %tmp, i8** %ptr
   ret { <16 x i8>, <16 x i8> } %ld2
 }
 
-declare { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2r.v16i8.p0i8(i8*) nounwind readonly
+declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8*) nounwind readonly
 
 
 define { <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld2r(i8* %A, i8** %ptr) nounwind {
 ;CHECK-LABEL: test_v8i8_post_imm_ld2r:
 ;CHECK: ld2r.8b { v0, v1 }, [x0], #2
-  %ld2 = call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2r.v8i8.p0i8(i8* %A)
+  %ld2 = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i32 2
   store i8* %tmp, i8** %ptr
   ret { <8 x i8>, <8 x i8> } %ld2
@@ -2156,19 +2156,19 @@
 define { <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld2r(i8* %A, i8** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v8i8_post_reg_ld2r:
 ;CHECK: ld2r.8b { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld2 = call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2r.v8i8.p0i8(i8* %A)
+  %ld2 = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   store i8* %tmp, i8** %ptr
   ret { <8 x i8>, <8 x i8> } %ld2
 }
 
-declare { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2r.v8i8.p0i8(i8*) nounwind readonly
+declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8*) nounwind readonly
 
 
 define { <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld2r(i16* %A, i16** %ptr) nounwind {
 ;CHECK-LABEL: test_v8i16_post_imm_ld2r:
 ;CHECK: ld2r.8h { v0, v1 }, [x0], #4
-  %ld2 = call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2r.v8i16.p0i16(i16* %A)
+  %ld2 = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i32 2
   store i16* %tmp, i16** %ptr
   ret { <8 x i16>, <8 x i16> } %ld2
@@ -2177,19 +2177,19 @@
 define { <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld2r(i16* %A, i16** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v8i16_post_reg_ld2r:
 ;CHECK: ld2r.8h { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld2 = call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2r.v8i16.p0i16(i16* %A)
+  %ld2 = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   store i16* %tmp, i16** %ptr
   ret { <8 x i16>, <8 x i16> } %ld2
 }
 
-declare { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2r.v8i16.p0i16(i16*) nounwind readonly
+declare { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16*) nounwind readonly
 
 
 define { <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld2r(i16* %A, i16** %ptr) nounwind {
 ;CHECK-LABEL: test_v4i16_post_imm_ld2r:
 ;CHECK: ld2r.4h { v0, v1 }, [x0], #4
-  %ld2 = call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2r.v4i16.p0i16(i16* %A)
+  %ld2 = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i32 2
   store i16* %tmp, i16** %ptr
   ret { <4 x i16>, <4 x i16> } %ld2
@@ -2198,19 +2198,19 @@
 define { <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld2r(i16* %A, i16** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4i16_post_reg_ld2r:
 ;CHECK: ld2r.4h { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld2 = call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2r.v4i16.p0i16(i16* %A)
+  %ld2 = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   store i16* %tmp, i16** %ptr
   ret { <4 x i16>, <4 x i16> } %ld2
 }
 
-declare { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2r.v4i16.p0i16(i16*) nounwind readonly
+declare { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16*) nounwind readonly
 
 
 define { <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld2r(i32* %A, i32** %ptr) nounwind {
 ;CHECK-LABEL: test_v4i32_post_imm_ld2r:
 ;CHECK: ld2r.4s { v0, v1 }, [x0], #8
-  %ld2 = call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2r.v4i32.p0i32(i32* %A)
+  %ld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i32 2
   store i32* %tmp, i32** %ptr
   ret { <4 x i32>, <4 x i32> } %ld2
@@ -2219,18 +2219,18 @@
 define { <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld2r(i32* %A, i32** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4i32_post_reg_ld2r:
 ;CHECK: ld2r.4s { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld2 = call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2r.v4i32.p0i32(i32* %A)
+  %ld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   store i32* %tmp, i32** %ptr
   ret { <4 x i32>, <4 x i32> } %ld2
 }
 
-declare { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2r.v4i32.p0i32(i32*) nounwind readonly
+declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32*) nounwind readonly
 
 define { <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld2r(i32* %A, i32** %ptr) nounwind {
 ;CHECK-LABEL: test_v2i32_post_imm_ld2r:
 ;CHECK: ld2r.2s { v0, v1 }, [x0], #8
-  %ld2 = call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2r.v2i32.p0i32(i32* %A)
+  %ld2 = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i32 2
   store i32* %tmp, i32** %ptr
   ret { <2 x i32>, <2 x i32> } %ld2
@@ -2239,19 +2239,19 @@
 define { <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld2r(i32* %A, i32** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2i32_post_reg_ld2r:
 ;CHECK: ld2r.2s { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld2 = call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2r.v2i32.p0i32(i32* %A)
+  %ld2 = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   store i32* %tmp, i32** %ptr
   ret { <2 x i32>, <2 x i32> } %ld2
 }
 
-declare { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2r.v2i32.p0i32(i32*) nounwind readonly
+declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32*) nounwind readonly
 
 
 define { <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld2r(i64* %A, i64** %ptr) nounwind {
 ;CHECK-LABEL: test_v2i64_post_imm_ld2r:
 ;CHECK: ld2r.2d { v0, v1 }, [x0], #16
-  %ld2 = call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2r.v2i64.p0i64(i64* %A)
+  %ld2 = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i32 2
   store i64* %tmp, i64** %ptr
   ret { <2 x i64>, <2 x i64> } %ld2
@@ -2260,18 +2260,18 @@
 define { <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld2r(i64* %A, i64** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2i64_post_reg_ld2r:
 ;CHECK: ld2r.2d { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld2 = call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2r.v2i64.p0i64(i64* %A)
+  %ld2 = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   store i64* %tmp, i64** %ptr
   ret { <2 x i64>, <2 x i64> } %ld2
 }
 
-declare { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2r.v2i64.p0i64(i64*) nounwind readonly
+declare { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64*) nounwind readonly
 
 define { <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld2r(i64* %A, i64** %ptr) nounwind {
 ;CHECK-LABEL: test_v1i64_post_imm_ld2r:
 ;CHECK: ld2r.1d { v0, v1 }, [x0], #16
-  %ld2 = call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2r.v1i64.p0i64(i64* %A)
+  %ld2 = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i32 2
   store i64* %tmp, i64** %ptr
   ret { <1 x i64>, <1 x i64> } %ld2
@@ -2280,19 +2280,19 @@
 define { <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld2r(i64* %A, i64** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v1i64_post_reg_ld2r:
 ;CHECK: ld2r.1d { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld2 = call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2r.v1i64.p0i64(i64* %A)
+  %ld2 = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   store i64* %tmp, i64** %ptr
   ret { <1 x i64>, <1 x i64> } %ld2
 }
 
-declare { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2r.v1i64.p0i64(i64*) nounwind readonly
+declare { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64*) nounwind readonly
 
 
 define { <4 x float>, <4 x float> } @test_v4f32_post_imm_ld2r(float* %A, float** %ptr) nounwind {
 ;CHECK-LABEL: test_v4f32_post_imm_ld2r:
 ;CHECK: ld2r.4s { v0, v1 }, [x0], #8
-  %ld2 = call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2r.v4f32.p0f32(float* %A)
+  %ld2 = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2r.v4f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i32 2
   store float* %tmp, float** %ptr
   ret { <4 x float>, <4 x float> } %ld2
@@ -2301,18 +2301,18 @@
 define { <4 x float>, <4 x float> } @test_v4f32_post_reg_ld2r(float* %A, float** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4f32_post_reg_ld2r:
 ;CHECK: ld2r.4s { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld2 = call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2r.v4f32.p0f32(float* %A)
+  %ld2 = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2r.v4f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   store float* %tmp, float** %ptr
   ret { <4 x float>, <4 x float> } %ld2
 }
 
-declare { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2r.v4f32.p0f32(float*) nounwind readonly
+declare { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2r.v4f32.p0f32(float*) nounwind readonly
 
 define { <2 x float>, <2 x float> } @test_v2f32_post_imm_ld2r(float* %A, float** %ptr) nounwind {
 ;CHECK-LABEL: test_v2f32_post_imm_ld2r:
 ;CHECK: ld2r.2s { v0, v1 }, [x0], #8
-  %ld2 = call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2r.v2f32.p0f32(float* %A)
+  %ld2 = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2r.v2f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i32 2
   store float* %tmp, float** %ptr
   ret { <2 x float>, <2 x float> } %ld2
@@ -2321,19 +2321,19 @@
 define { <2 x float>, <2 x float> } @test_v2f32_post_reg_ld2r(float* %A, float** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2f32_post_reg_ld2r:
 ;CHECK: ld2r.2s { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld2 = call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2r.v2f32.p0f32(float* %A)
+  %ld2 = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2r.v2f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   store float* %tmp, float** %ptr
   ret { <2 x float>, <2 x float> } %ld2
 }
 
-declare { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2r.v2f32.p0f32(float*) nounwind readonly
+declare { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2r.v2f32.p0f32(float*) nounwind readonly
 
 
 define { <2 x double>, <2 x double> } @test_v2f64_post_imm_ld2r(double* %A, double** %ptr) nounwind {
 ;CHECK-LABEL: test_v2f64_post_imm_ld2r:
 ;CHECK: ld2r.2d { v0, v1 }, [x0], #16
-  %ld2 = call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2r.v2f64.p0f64(double* %A)
+  %ld2 = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2r.v2f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i32 2
   store double* %tmp, double** %ptr
   ret { <2 x double>, <2 x double> } %ld2
@@ -2342,18 +2342,18 @@
 define { <2 x double>, <2 x double> } @test_v2f64_post_reg_ld2r(double* %A, double** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2f64_post_reg_ld2r:
 ;CHECK: ld2r.2d { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld2 = call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2r.v2f64.p0f64(double* %A)
+  %ld2 = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2r.v2f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   store double* %tmp, double** %ptr
   ret { <2 x double>, <2 x double> } %ld2
 }
 
-declare { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2r.v2f64.p0f64(double*) nounwind readonly
+declare { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2r.v2f64.p0f64(double*) nounwind readonly
 
 define { <1 x double>, <1 x double> } @test_v1f64_post_imm_ld2r(double* %A, double** %ptr) nounwind {
 ;CHECK-LABEL: test_v1f64_post_imm_ld2r:
 ;CHECK: ld2r.1d { v0, v1 }, [x0], #16
-  %ld2 = call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2r.v1f64.p0f64(double* %A)
+  %ld2 = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2r.v1f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i32 2
   store double* %tmp, double** %ptr
   ret { <1 x double>, <1 x double> } %ld2
@@ -2362,19 +2362,19 @@
 define { <1 x double>, <1 x double> } @test_v1f64_post_reg_ld2r(double* %A, double** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v1f64_post_reg_ld2r:
 ;CHECK: ld2r.1d { v0, v1 }, [x0], x{{[0-9]+}}
-  %ld2 = call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2r.v1f64.p0f64(double* %A)
+  %ld2 = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2r.v1f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   store double* %tmp, double** %ptr
   ret { <1 x double>, <1 x double> } %ld2
 }
 
-declare { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2r.v1f64.p0f64(double*) nounwind readonly
+declare { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2r.v1f64.p0f64(double*) nounwind readonly
 
 
 define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld3r(i8* %A, i8** %ptr) nounwind {
 ;CHECK-LABEL: test_v16i8_post_imm_ld3r:
 ;CHECK: ld3r.16b { v0, v1, v2 }, [x0], #3
-  %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3r.v16i8.p0i8(i8* %A)
+  %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i32 3
   store i8* %tmp, i8** %ptr
   ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3
@@ -2383,19 +2383,19 @@
 define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld3r(i8* %A, i8** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v16i8_post_reg_ld3r:
 ;CHECK: ld3r.16b { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3r.v16i8.p0i8(i8* %A)
+  %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   store i8* %tmp, i8** %ptr
   ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3
 }
 
-declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3r.v16i8.p0i8(i8*) nounwind readonly
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8*) nounwind readonly
 
 
 define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld3r(i8* %A, i8** %ptr) nounwind {
 ;CHECK-LABEL: test_v8i8_post_imm_ld3r:
 ;CHECK: ld3r.8b { v0, v1, v2 }, [x0], #3
-  %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3r.v8i8.p0i8(i8* %A)
+  %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i32 3
   store i8* %tmp, i8** %ptr
   ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3
@@ -2404,19 +2404,19 @@
 define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld3r(i8* %A, i8** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v8i8_post_reg_ld3r:
 ;CHECK: ld3r.8b { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3r.v8i8.p0i8(i8* %A)
+  %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   store i8* %tmp, i8** %ptr
   ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3
 }
 
-declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3r.v8i8.p0i8(i8*) nounwind readonly
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8*) nounwind readonly
 
 
 define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld3r(i16* %A, i16** %ptr) nounwind {
 ;CHECK-LABEL: test_v8i16_post_imm_ld3r:
 ;CHECK: ld3r.8h { v0, v1, v2 }, [x0], #6
-  %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3r.v8i16.p0i16(i16* %A)
+  %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i32 3
   store i16* %tmp, i16** %ptr
   ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3
@@ -2425,19 +2425,19 @@
 define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld3r(i16* %A, i16** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v8i16_post_reg_ld3r:
 ;CHECK: ld3r.8h { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3r.v8i16.p0i16(i16* %A)
+  %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   store i16* %tmp, i16** %ptr
   ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3
 }
 
-declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3r.v8i16.p0i16(i16*) nounwind readonly
+declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16*) nounwind readonly
 
 
 define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld3r(i16* %A, i16** %ptr) nounwind {
 ;CHECK-LABEL: test_v4i16_post_imm_ld3r:
 ;CHECK: ld3r.4h { v0, v1, v2 }, [x0], #6
-  %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3r.v4i16.p0i16(i16* %A)
+  %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i32 3
   store i16* %tmp, i16** %ptr
   ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3
@@ -2446,19 +2446,19 @@
 define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld3r(i16* %A, i16** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4i16_post_reg_ld3r:
 ;CHECK: ld3r.4h { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3r.v4i16.p0i16(i16* %A)
+  %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   store i16* %tmp, i16** %ptr
   ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3
 }
 
-declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3r.v4i16.p0i16(i16*) nounwind readonly
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16*) nounwind readonly
 
 
 define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld3r(i32* %A, i32** %ptr) nounwind {
 ;CHECK-LABEL: test_v4i32_post_imm_ld3r:
 ;CHECK: ld3r.4s { v0, v1, v2 }, [x0], #12
-  %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3r.v4i32.p0i32(i32* %A)
+  %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i32 3
   store i32* %tmp, i32** %ptr
   ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3
@@ -2467,18 +2467,18 @@
 define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld3r(i32* %A, i32** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4i32_post_reg_ld3r:
 ;CHECK: ld3r.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3r.v4i32.p0i32(i32* %A)
+  %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   store i32* %tmp, i32** %ptr
   ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3
 }
 
-declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3r.v4i32.p0i32(i32*) nounwind readonly
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32*) nounwind readonly
 
 define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld3r(i32* %A, i32** %ptr) nounwind {
 ;CHECK-LABEL: test_v2i32_post_imm_ld3r:
 ;CHECK: ld3r.2s { v0, v1, v2 }, [x0], #12
-  %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3r.v2i32.p0i32(i32* %A)
+  %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i32 3
   store i32* %tmp, i32** %ptr
   ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3
@@ -2487,19 +2487,19 @@
 define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld3r(i32* %A, i32** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2i32_post_reg_ld3r:
 ;CHECK: ld3r.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3r.v2i32.p0i32(i32* %A)
+  %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   store i32* %tmp, i32** %ptr
   ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3
 }
 
-declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3r.v2i32.p0i32(i32*) nounwind readonly
+declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32*) nounwind readonly
 
 
 define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld3r(i64* %A, i64** %ptr) nounwind {
 ;CHECK-LABEL: test_v2i64_post_imm_ld3r:
 ;CHECK: ld3r.2d { v0, v1, v2 }, [x0], #24
-  %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3r.v2i64.p0i64(i64* %A)
+  %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i32 3
   store i64* %tmp, i64** %ptr
   ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3
@@ -2508,18 +2508,18 @@
 define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld3r(i64* %A, i64** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2i64_post_reg_ld3r:
 ;CHECK: ld3r.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3r.v2i64.p0i64(i64* %A)
+  %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   store i64* %tmp, i64** %ptr
   ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3
 }
 
-declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3r.v2i64.p0i64(i64*) nounwind readonly
+declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64*) nounwind readonly
 
 define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld3r(i64* %A, i64** %ptr) nounwind {
 ;CHECK-LABEL: test_v1i64_post_imm_ld3r:
 ;CHECK: ld3r.1d { v0, v1, v2 }, [x0], #24
-  %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3r.v1i64.p0i64(i64* %A)
+  %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i32 3
   store i64* %tmp, i64** %ptr
   ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3
@@ -2528,19 +2528,19 @@
 define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld3r(i64* %A, i64** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v1i64_post_reg_ld3r:
 ;CHECK: ld3r.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3r.v1i64.p0i64(i64* %A)
+  %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   store i64* %tmp, i64** %ptr
   ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3
 }
 
-declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3r.v1i64.p0i64(i64*) nounwind readonly
+declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64*) nounwind readonly
 
 
 define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld3r(float* %A, float** %ptr) nounwind {
 ;CHECK-LABEL: test_v4f32_post_imm_ld3r:
 ;CHECK: ld3r.4s { v0, v1, v2 }, [x0], #12
-  %ld3 = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3r.v4f32.p0f32(float* %A)
+  %ld3 = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3r.v4f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i32 3
   store float* %tmp, float** %ptr
   ret { <4 x float>, <4 x float>, <4 x float> } %ld3
@@ -2549,18 +2549,18 @@
 define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld3r(float* %A, float** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4f32_post_reg_ld3r:
 ;CHECK: ld3r.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld3 = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3r.v4f32.p0f32(float* %A)
+  %ld3 = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3r.v4f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   store float* %tmp, float** %ptr
   ret { <4 x float>, <4 x float>, <4 x float> } %ld3
 }
 
-declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3r.v4f32.p0f32(float*) nounwind readonly
+declare { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3r.v4f32.p0f32(float*) nounwind readonly
 
 define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld3r(float* %A, float** %ptr) nounwind {
 ;CHECK-LABEL: test_v2f32_post_imm_ld3r:
 ;CHECK: ld3r.2s { v0, v1, v2 }, [x0], #12
-  %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3r.v2f32.p0f32(float* %A)
+  %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3r.v2f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i32 3
   store float* %tmp, float** %ptr
   ret { <2 x float>, <2 x float>, <2 x float> } %ld3
@@ -2569,19 +2569,19 @@
 define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld3r(float* %A, float** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2f32_post_reg_ld3r:
 ;CHECK: ld3r.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3r.v2f32.p0f32(float* %A)
+  %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3r.v2f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   store float* %tmp, float** %ptr
   ret { <2 x float>, <2 x float>, <2 x float> } %ld3
 }
 
-declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3r.v2f32.p0f32(float*) nounwind readonly
+declare { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3r.v2f32.p0f32(float*) nounwind readonly
 
 
 define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld3r(double* %A, double** %ptr) nounwind {
 ;CHECK-LABEL: test_v2f64_post_imm_ld3r:
 ;CHECK: ld3r.2d { v0, v1, v2 }, [x0], #24
-  %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3r.v2f64.p0f64(double* %A)
+  %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3r.v2f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i32 3
   store double* %tmp, double** %ptr
   ret { <2 x double>, <2 x double>, <2 x double> } %ld3
@@ -2590,18 +2590,18 @@
 define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld3r(double* %A, double** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2f64_post_reg_ld3r:
 ;CHECK: ld3r.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3r.v2f64.p0f64(double* %A)
+  %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3r.v2f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   store double* %tmp, double** %ptr
   ret { <2 x double>, <2 x double>, <2 x double> } %ld3
 }
 
-declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3r.v2f64.p0f64(double*) nounwind readonly
+declare { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3r.v2f64.p0f64(double*) nounwind readonly
 
 define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld3r(double* %A, double** %ptr) nounwind {
 ;CHECK-LABEL: test_v1f64_post_imm_ld3r:
 ;CHECK: ld3r.1d { v0, v1, v2 }, [x0], #24
-  %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3r.v1f64.p0f64(double* %A)
+  %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3r.v1f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i32 3
   store double* %tmp, double** %ptr
   ret { <1 x double>, <1 x double>, <1 x double> } %ld3
@@ -2610,19 +2610,19 @@
 define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld3r(double* %A, double** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v1f64_post_reg_ld3r:
 ;CHECK: ld3r.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3r.v1f64.p0f64(double* %A)
+  %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3r.v1f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   store double* %tmp, double** %ptr
   ret { <1 x double>, <1 x double>, <1 x double> } %ld3
 }
 
-declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3r.v1f64.p0f64(double*) nounwind readonly
+declare { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3r.v1f64.p0f64(double*) nounwind readonly
 
 
 define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld4r(i8* %A, i8** %ptr) nounwind {
 ;CHECK-LABEL: test_v16i8_post_imm_ld4r:
 ;CHECK: ld4r.16b { v0, v1, v2, v3 }, [x0], #4
-  %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4r.v16i8.p0i8(i8* %A)
+  %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i32 4
   store i8* %tmp, i8** %ptr
   ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4
@@ -2631,19 +2631,19 @@
 define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld4r(i8* %A, i8** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v16i8_post_reg_ld4r:
 ;CHECK: ld4r.16b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4r.v16i8.p0i8(i8* %A)
+  %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   store i8* %tmp, i8** %ptr
   ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4
 }
 
-declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4r.v16i8.p0i8(i8*) nounwind readonly
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8*) nounwind readonly
 
 
 define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld4r(i8* %A, i8** %ptr) nounwind {
 ;CHECK-LABEL: test_v8i8_post_imm_ld4r:
 ;CHECK: ld4r.8b { v0, v1, v2, v3 }, [x0], #4
-  %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4r.v8i8.p0i8(i8* %A)
+  %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i32 4
   store i8* %tmp, i8** %ptr
   ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4
@@ -2652,19 +2652,19 @@
 define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld4r(i8* %A, i8** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v8i8_post_reg_ld4r:
 ;CHECK: ld4r.8b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4r.v8i8.p0i8(i8* %A)
+  %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   store i8* %tmp, i8** %ptr
   ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4
 }
 
-declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4r.v8i8.p0i8(i8*) nounwind readonly
+declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8*) nounwind readonly
 
 
 define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld4r(i16* %A, i16** %ptr) nounwind {
 ;CHECK-LABEL: test_v8i16_post_imm_ld4r:
 ;CHECK: ld4r.8h { v0, v1, v2, v3 }, [x0], #8
-  %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4r.v8i16.p0i16(i16* %A)
+  %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i32 4
   store i16* %tmp, i16** %ptr
   ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4
@@ -2673,19 +2673,19 @@
 define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld4r(i16* %A, i16** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v8i16_post_reg_ld4r:
 ;CHECK: ld4r.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4r.v8i16.p0i16(i16* %A)
+  %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   store i16* %tmp, i16** %ptr
   ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4
 }
 
-declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4r.v8i16.p0i16(i16*) nounwind readonly
+declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16*) nounwind readonly
 
 
 define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld4r(i16* %A, i16** %ptr) nounwind {
 ;CHECK-LABEL: test_v4i16_post_imm_ld4r:
 ;CHECK: ld4r.4h { v0, v1, v2, v3 }, [x0], #8
-  %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4r.v4i16.p0i16(i16* %A)
+  %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i32 4
   store i16* %tmp, i16** %ptr
   ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4
@@ -2694,19 +2694,19 @@
 define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld4r(i16* %A, i16** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4i16_post_reg_ld4r:
 ;CHECK: ld4r.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4r.v4i16.p0i16(i16* %A)
+  %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   store i16* %tmp, i16** %ptr
   ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4
 }
 
-declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4r.v4i16.p0i16(i16*) nounwind readonly
+declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16*) nounwind readonly
 
 
 define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld4r(i32* %A, i32** %ptr) nounwind {
 ;CHECK-LABEL: test_v4i32_post_imm_ld4r:
 ;CHECK: ld4r.4s { v0, v1, v2, v3 }, [x0], #16
-  %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4r.v4i32.p0i32(i32* %A)
+  %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i32 4
   store i32* %tmp, i32** %ptr
   ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4
@@ -2715,18 +2715,18 @@
 define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld4r(i32* %A, i32** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4i32_post_reg_ld4r:
 ;CHECK: ld4r.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4r.v4i32.p0i32(i32* %A)
+  %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   store i32* %tmp, i32** %ptr
   ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4
 }
 
-declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4r.v4i32.p0i32(i32*) nounwind readonly
+declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32*) nounwind readonly
 
 define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld4r(i32* %A, i32** %ptr) nounwind {
 ;CHECK-LABEL: test_v2i32_post_imm_ld4r:
 ;CHECK: ld4r.2s { v0, v1, v2, v3 }, [x0], #16
-  %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4r.v2i32.p0i32(i32* %A)
+  %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i32 4
   store i32* %tmp, i32** %ptr
   ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4
@@ -2735,19 +2735,19 @@
 define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld4r(i32* %A, i32** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2i32_post_reg_ld4r:
 ;CHECK: ld4r.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4r.v2i32.p0i32(i32* %A)
+  %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   store i32* %tmp, i32** %ptr
   ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4
 }
 
-declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4r.v2i32.p0i32(i32*) nounwind readonly
+declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32*) nounwind readonly
 
 
 define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld4r(i64* %A, i64** %ptr) nounwind {
 ;CHECK-LABEL: test_v2i64_post_imm_ld4r:
 ;CHECK: ld4r.2d { v0, v1, v2, v3 }, [x0], #32
-  %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4r.v2i64.p0i64(i64* %A)
+  %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i32 4
   store i64* %tmp, i64** %ptr
   ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4
@@ -2756,18 +2756,18 @@
 define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld4r(i64* %A, i64** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2i64_post_reg_ld4r:
 ;CHECK: ld4r.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4r.v2i64.p0i64(i64* %A)
+  %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   store i64* %tmp, i64** %ptr
   ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4
 }
 
-declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4r.v2i64.p0i64(i64*) nounwind readonly
+declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64*) nounwind readonly
 
 define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld4r(i64* %A, i64** %ptr) nounwind {
 ;CHECK-LABEL: test_v1i64_post_imm_ld4r:
 ;CHECK: ld4r.1d { v0, v1, v2, v3 }, [x0], #32
-  %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4r.v1i64.p0i64(i64* %A)
+  %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i32 4
   store i64* %tmp, i64** %ptr
   ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4
@@ -2776,19 +2776,19 @@
 define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld4r(i64* %A, i64** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v1i64_post_reg_ld4r:
 ;CHECK: ld4r.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4r.v1i64.p0i64(i64* %A)
+  %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   store i64* %tmp, i64** %ptr
   ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4
 }
 
-declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4r.v1i64.p0i64(i64*) nounwind readonly
+declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64*) nounwind readonly
 
 
 define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld4r(float* %A, float** %ptr) nounwind {
 ;CHECK-LABEL: test_v4f32_post_imm_ld4r:
 ;CHECK: ld4r.4s { v0, v1, v2, v3 }, [x0], #16
-  %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4r.v4f32.p0f32(float* %A)
+  %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4r.v4f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i32 4
   store float* %tmp, float** %ptr
   ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4
@@ -2797,18 +2797,18 @@
 define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld4r(float* %A, float** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4f32_post_reg_ld4r:
 ;CHECK: ld4r.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4r.v4f32.p0f32(float* %A)
+  %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4r.v4f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   store float* %tmp, float** %ptr
   ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4
 }
 
-declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4r.v4f32.p0f32(float*) nounwind readonly
+declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4r.v4f32.p0f32(float*) nounwind readonly
 
 define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld4r(float* %A, float** %ptr) nounwind {
 ;CHECK-LABEL: test_v2f32_post_imm_ld4r:
 ;CHECK: ld4r.2s { v0, v1, v2, v3 }, [x0], #16
-  %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4r.v2f32.p0f32(float* %A)
+  %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4r.v2f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i32 4
   store float* %tmp, float** %ptr
   ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4
@@ -2817,19 +2817,19 @@
 define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld4r(float* %A, float** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2f32_post_reg_ld4r:
 ;CHECK: ld4r.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4r.v2f32.p0f32(float* %A)
+  %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4r.v2f32.p0f32(float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   store float* %tmp, float** %ptr
   ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4
 }
 
-declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4r.v2f32.p0f32(float*) nounwind readonly
+declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4r.v2f32.p0f32(float*) nounwind readonly
 
 
 define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld4r(double* %A, double** %ptr) nounwind {
 ;CHECK-LABEL: test_v2f64_post_imm_ld4r:
 ;CHECK: ld4r.2d { v0, v1, v2, v3 }, [x0], #32
-  %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4r.v2f64.p0f64(double* %A)
+  %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4r.v2f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i32 4
   store double* %tmp, double** %ptr
   ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4
@@ -2838,18 +2838,18 @@
 define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld4r(double* %A, double** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2f64_post_reg_ld4r:
 ;CHECK: ld4r.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4r.v2f64.p0f64(double* %A)
+  %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4r.v2f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   store double* %tmp, double** %ptr
   ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4
 }
 
-declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4r.v2f64.p0f64(double*) nounwind readonly
+declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4r.v2f64.p0f64(double*) nounwind readonly
 
 define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld4r(double* %A, double** %ptr) nounwind {
 ;CHECK-LABEL: test_v1f64_post_imm_ld4r:
 ;CHECK: ld4r.1d { v0, v1, v2, v3 }, [x0], #32
-  %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4r.v1f64.p0f64(double* %A)
+  %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4r.v1f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i32 4
   store double* %tmp, double** %ptr
   ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4
@@ -2858,19 +2858,19 @@
 define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld4r(double* %A, double** %ptr, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v1f64_post_reg_ld4r:
 ;CHECK: ld4r.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4r.v1f64.p0f64(double* %A)
+  %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4r.v1f64.p0f64(double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   store double* %tmp, double** %ptr
   ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4
 }
 
-declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4r.v1f64.p0f64(double*) nounwind readonly
+declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4r.v1f64.p0f64(double*) nounwind readonly
 
 
 define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld2lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) nounwind {
 ;CHECK-LABEL: test_v16i8_post_imm_ld2lane:
 ;CHECK: ld2.b { v0, v1 }[0], [x0], #2
-  %ld2 = call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A)
+  %ld2 = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A)
   %tmp = getelementptr i8* %A, i32 2
   store i8* %tmp, i8** %ptr
   ret { <16 x i8>, <16 x i8> } %ld2
@@ -2879,19 +2879,19 @@
 define { <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld2lane(i8* %A, i8** %ptr, i64 %inc, <16 x i8> %B, <16 x i8> %C) nounwind {
 ;CHECK-LABEL: test_v16i8_post_reg_ld2lane:
 ;CHECK: ld2.b { v0, v1 }[0], [x0], x{{[0-9]+}}
-  %ld2 = call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A)
+  %ld2 = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   store i8* %tmp, i8** %ptr
   ret { <16 x i8>, <16 x i8> } %ld2
 }
 
-declare { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
+declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
 
 
 define { <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld2lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C) nounwind {
 ;CHECK-LABEL: test_v8i8_post_imm_ld2lane:
 ;CHECK: ld2.b { v0, v1 }[0], [x0], #2
-  %ld2 = call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A)
+  %ld2 = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A)
   %tmp = getelementptr i8* %A, i32 2
   store i8* %tmp, i8** %ptr
   ret { <8 x i8>, <8 x i8> } %ld2
@@ -2900,19 +2900,19 @@
 define { <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld2lane(i8* %A, i8** %ptr, i64 %inc, <8 x i8> %B, <8 x i8> %C) nounwind {
 ;CHECK-LABEL: test_v8i8_post_reg_ld2lane:
 ;CHECK: ld2.b { v0, v1 }[0], [x0], x{{[0-9]+}}
-  %ld2 = call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A)
+  %ld2 = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   store i8* %tmp, i8** %ptr
   ret { <8 x i8>, <8 x i8> } %ld2
 }
 
-declare { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2lane.v8i8.p0i8(<8 x i8>, <8 x i8>, i64, i8*) nounwind readonly
+declare { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8>, <8 x i8>, i64, i8*) nounwind readonly
 
 
 define { <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld2lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C) nounwind {
 ;CHECK-LABEL: test_v8i16_post_imm_ld2lane:
 ;CHECK: ld2.h { v0, v1 }[0], [x0], #4
-  %ld2 = call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A)
+  %ld2 = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A)
   %tmp = getelementptr i16* %A, i32 2
   store i16* %tmp, i16** %ptr
   ret { <8 x i16>, <8 x i16> } %ld2
@@ -2921,19 +2921,19 @@
 define { <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld2lane(i16* %A, i16** %ptr, i64 %inc, <8 x i16> %B, <8 x i16> %C) nounwind {
 ;CHECK-LABEL: test_v8i16_post_reg_ld2lane:
 ;CHECK: ld2.h { v0, v1 }[0], [x0], x{{[0-9]+}}
-  %ld2 = call { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A)
+  %ld2 = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   store i16* %tmp, i16** %ptr
   ret { <8 x i16>, <8 x i16> } %ld2
 }
 
-declare { <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
+declare { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
 
 
 define { <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld2lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C) nounwind {
 ;CHECK-LABEL: test_v4i16_post_imm_ld2lane:
 ;CHECK: ld2.h { v0, v1 }[0], [x0], #4
-  %ld2 = call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A)
+  %ld2 = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A)
   %tmp = getelementptr i16* %A, i32 2
   store i16* %tmp, i16** %ptr
   ret { <4 x i16>, <4 x i16> } %ld2
@@ -2942,19 +2942,19 @@
 define { <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld2lane(i16* %A, i16** %ptr, i64 %inc, <4 x i16> %B, <4 x i16> %C) nounwind {
 ;CHECK-LABEL: test_v4i16_post_reg_ld2lane:
 ;CHECK: ld2.h { v0, v1 }[0], [x0], x{{[0-9]+}}
-  %ld2 = call { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A)
+  %ld2 = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   store i16* %tmp, i16** %ptr
   ret { <4 x i16>, <4 x i16> } %ld2
 }
 
-declare { <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld2lane.v4i16.p0i16(<4 x i16>, <4 x i16>, i64, i16*) nounwind readonly
+declare { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i16(<4 x i16>, <4 x i16>, i64, i16*) nounwind readonly
 
 
 define { <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld2lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C) nounwind {
 ;CHECK-LABEL: test_v4i32_post_imm_ld2lane:
 ;CHECK: ld2.s { v0, v1 }[0], [x0], #8
-  %ld2 = call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A)
+  %ld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A)
   %tmp = getelementptr i32* %A, i32 2
   store i32* %tmp, i32** %ptr
   ret { <4 x i32>, <4 x i32> } %ld2
@@ -2963,19 +2963,19 @@
 define { <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld2lane(i32* %A, i32** %ptr, i64 %inc, <4 x i32> %B, <4 x i32> %C) nounwind {
 ;CHECK-LABEL: test_v4i32_post_reg_ld2lane:
 ;CHECK: ld2.s { v0, v1 }[0], [x0], x{{[0-9]+}}
-  %ld2 = call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A)
+  %ld2 = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   store i32* %tmp, i32** %ptr
   ret { <4 x i32>, <4 x i32> } %ld2
 }
 
-declare { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
+declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
 
 
 define { <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld2lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C) nounwind {
 ;CHECK-LABEL: test_v2i32_post_imm_ld2lane:
 ;CHECK: ld2.s { v0, v1 }[0], [x0], #8
-  %ld2 = call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A)
+  %ld2 = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A)
   %tmp = getelementptr i32* %A, i32 2
   store i32* %tmp, i32** %ptr
   ret { <2 x i32>, <2 x i32> } %ld2
@@ -2984,19 +2984,19 @@
 define { <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld2lane(i32* %A, i32** %ptr, i64 %inc, <2 x i32> %B, <2 x i32> %C) nounwind {
 ;CHECK-LABEL: test_v2i32_post_reg_ld2lane:
 ;CHECK: ld2.s { v0, v1 }[0], [x0], x{{[0-9]+}}
-  %ld2 = call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A)
+  %ld2 = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   store i32* %tmp, i32** %ptr
   ret { <2 x i32>, <2 x i32> } %ld2
 }
 
-declare { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2lane.v2i32.p0i32(<2 x i32>, <2 x i32>, i64, i32*) nounwind readonly
+declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i32(<2 x i32>, <2 x i32>, i64, i32*) nounwind readonly
 
 
 define { <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld2lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C) nounwind {
 ;CHECK-LABEL: test_v2i64_post_imm_ld2lane:
 ;CHECK: ld2.d { v0, v1 }[0], [x0], #16
-  %ld2 = call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A)
+  %ld2 = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A)
   %tmp = getelementptr i64* %A, i32 2
   store i64* %tmp, i64** %ptr
   ret { <2 x i64>, <2 x i64> } %ld2
@@ -3005,19 +3005,19 @@
 define { <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld2lane(i64* %A, i64** %ptr, i64 %inc, <2 x i64> %B, <2 x i64> %C) nounwind {
 ;CHECK-LABEL: test_v2i64_post_reg_ld2lane:
 ;CHECK: ld2.d { v0, v1 }[0], [x0], x{{[0-9]+}}
-  %ld2 = call { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A)
+  %ld2 = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   store i64* %tmp, i64** %ptr
   ret { <2 x i64>, <2 x i64> } %ld2
 }
 
-declare { <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
+declare { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
 
 
 define { <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld2lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C) nounwind {
 ;CHECK-LABEL: test_v1i64_post_imm_ld2lane:
 ;CHECK: ld2.d { v0, v1 }[0], [x0], #16
-  %ld2 = call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A)
+  %ld2 = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A)
   %tmp = getelementptr i64* %A, i32 2
   store i64* %tmp, i64** %ptr
   ret { <1 x i64>, <1 x i64> } %ld2
@@ -3026,19 +3026,19 @@
 define { <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld2lane(i64* %A, i64** %ptr, i64 %inc, <1 x i64> %B, <1 x i64> %C) nounwind {
 ;CHECK-LABEL: test_v1i64_post_reg_ld2lane:
 ;CHECK: ld2.d { v0, v1 }[0], [x0], x{{[0-9]+}}
-  %ld2 = call { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A)
+  %ld2 = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   store i64* %tmp, i64** %ptr
   ret { <1 x i64>, <1 x i64> } %ld2
 }
 
-declare { <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld2lane.v1i64.p0i64(<1 x i64>, <1 x i64>, i64, i64*) nounwind readonly
+declare { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i64(<1 x i64>, <1 x i64>, i64, i64*) nounwind readonly
 
 
 define { <4 x float>, <4 x float> } @test_v4f32_post_imm_ld2lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C) nounwind {
 ;CHECK-LABEL: test_v4f32_post_imm_ld2lane:
 ;CHECK: ld2.s { v0, v1 }[0], [x0], #8
-  %ld2 = call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A)
+  %ld2 = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A)
   %tmp = getelementptr float* %A, i32 2
   store float* %tmp, float** %ptr
   ret { <4 x float>, <4 x float> } %ld2
@@ -3047,19 +3047,19 @@
 define { <4 x float>, <4 x float> } @test_v4f32_post_reg_ld2lane(float* %A, float** %ptr, i64 %inc, <4 x float> %B, <4 x float> %C) nounwind {
 ;CHECK-LABEL: test_v4f32_post_reg_ld2lane:
 ;CHECK: ld2.s { v0, v1 }[0], [x0], x{{[0-9]+}}
-  %ld2 = call { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A)
+  %ld2 = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   store float* %tmp, float** %ptr
   ret { <4 x float>, <4 x float> } %ld2
 }
 
-declare { <4 x float>, <4 x float> } @llvm.arm64.neon.ld2lane.v4f32.p0f32(<4 x float>, <4 x float>, i64, float*) nounwind readonly
+declare { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0f32(<4 x float>, <4 x float>, i64, float*) nounwind readonly
 
 
 define { <2 x float>, <2 x float> } @test_v2f32_post_imm_ld2lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C) nounwind {
 ;CHECK-LABEL: test_v2f32_post_imm_ld2lane:
 ;CHECK: ld2.s { v0, v1 }[0], [x0], #8
-  %ld2 = call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A)
+  %ld2 = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A)
   %tmp = getelementptr float* %A, i32 2
   store float* %tmp, float** %ptr
   ret { <2 x float>, <2 x float> } %ld2
@@ -3068,19 +3068,19 @@
 define { <2 x float>, <2 x float> } @test_v2f32_post_reg_ld2lane(float* %A, float** %ptr, i64 %inc, <2 x float> %B, <2 x float> %C) nounwind {
 ;CHECK-LABEL: test_v2f32_post_reg_ld2lane:
 ;CHECK: ld2.s { v0, v1 }[0], [x0], x{{[0-9]+}}
-  %ld2 = call { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A)
+  %ld2 = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   store float* %tmp, float** %ptr
   ret { <2 x float>, <2 x float> } %ld2
 }
 
-declare { <2 x float>, <2 x float> } @llvm.arm64.neon.ld2lane.v2f32.p0f32(<2 x float>, <2 x float>, i64, float*) nounwind readonly
+declare { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0f32(<2 x float>, <2 x float>, i64, float*) nounwind readonly
 
 
 define { <2 x double>, <2 x double> } @test_v2f64_post_imm_ld2lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C) nounwind {
 ;CHECK-LABEL: test_v2f64_post_imm_ld2lane:
 ;CHECK: ld2.d { v0, v1 }[0], [x0], #16
-  %ld2 = call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A)
+  %ld2 = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A)
   %tmp = getelementptr double* %A, i32 2
   store double* %tmp, double** %ptr
   ret { <2 x double>, <2 x double> } %ld2
@@ -3089,19 +3089,19 @@
 define { <2 x double>, <2 x double> } @test_v2f64_post_reg_ld2lane(double* %A, double** %ptr, i64 %inc, <2 x double> %B, <2 x double> %C) nounwind {
 ;CHECK-LABEL: test_v2f64_post_reg_ld2lane:
 ;CHECK: ld2.d { v0, v1 }[0], [x0], x{{[0-9]+}}
-  %ld2 = call { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A)
+  %ld2 = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   store double* %tmp, double** %ptr
   ret { <2 x double>, <2 x double> } %ld2
 }
 
-declare { <2 x double>, <2 x double> } @llvm.arm64.neon.ld2lane.v2f64.p0f64(<2 x double>, <2 x double>, i64, double*) nounwind readonly
+declare { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2lane.v2f64.p0f64(<2 x double>, <2 x double>, i64, double*) nounwind readonly
 
 
 define { <1 x double>, <1 x double> } @test_v1f64_post_imm_ld2lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C) nounwind {
 ;CHECK-LABEL: test_v1f64_post_imm_ld2lane:
 ;CHECK: ld2.d { v0, v1 }[0], [x0], #16
-  %ld2 = call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A)
+  %ld2 = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A)
   %tmp = getelementptr double* %A, i32 2
   store double* %tmp, double** %ptr
   ret { <1 x double>, <1 x double> } %ld2
@@ -3110,19 +3110,19 @@
 define { <1 x double>, <1 x double> } @test_v1f64_post_reg_ld2lane(double* %A, double** %ptr, i64 %inc, <1 x double> %B, <1 x double> %C) nounwind {
 ;CHECK-LABEL: test_v1f64_post_reg_ld2lane:
 ;CHECK: ld2.d { v0, v1 }[0], [x0], x{{[0-9]+}}
-  %ld2 = call { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A)
+  %ld2 = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   store double* %tmp, double** %ptr
   ret { <1 x double>, <1 x double> } %ld2
 }
 
-declare { <1 x double>, <1 x double> } @llvm.arm64.neon.ld2lane.v1f64.p0f64(<1 x double>, <1 x double>, i64, double*) nounwind readonly
+declare { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2lane.v1f64.p0f64(<1 x double>, <1 x double>, i64, double*) nounwind readonly
 
 
 define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld3lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind {
 ;CHECK-LABEL: test_v16i8_post_imm_ld3lane:
 ;CHECK: ld3.b { v0, v1, v2 }[0], [x0], #3
-  %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A)
+  %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A)
   %tmp = getelementptr i8* %A, i32 3
   store i8* %tmp, i8** %ptr
   ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3
@@ -3131,19 +3131,19 @@
 define { <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld3lane(i8* %A, i8** %ptr, i64 %inc, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind {
 ;CHECK-LABEL: test_v16i8_post_reg_ld3lane:
 ;CHECK: ld3.b { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
-  %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A)
+  %ld3 = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   store i8* %tmp, i8** %ptr
   ret { <16 x i8>, <16 x i8>, <16 x i8> } %ld3
 }
 
-declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
 
 
 define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld3lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind {
 ;CHECK-LABEL: test_v8i8_post_imm_ld3lane:
 ;CHECK: ld3.b { v0, v1, v2 }[0], [x0], #3
-  %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A)
+  %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A)
   %tmp = getelementptr i8* %A, i32 3
   store i8* %tmp, i8** %ptr
   ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3
@@ -3152,19 +3152,19 @@
 define { <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld3lane(i8* %A, i8** %ptr, i64 %inc, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind {
 ;CHECK-LABEL: test_v8i8_post_reg_ld3lane:
 ;CHECK: ld3.b { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
-  %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A)
+  %ld3 = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   store i8* %tmp, i8** %ptr
   ret { <8 x i8>, <8 x i8>, <8 x i8> } %ld3
 }
 
-declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i64, i8*) nounwind readonly
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i64, i8*) nounwind readonly
 
 
 define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld3lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind {
 ;CHECK-LABEL: test_v8i16_post_imm_ld3lane:
 ;CHECK: ld3.h { v0, v1, v2 }[0], [x0], #6
-  %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A)
+  %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A)
   %tmp = getelementptr i16* %A, i32 3
   store i16* %tmp, i16** %ptr
   ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3
@@ -3173,19 +3173,19 @@
 define { <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld3lane(i16* %A, i16** %ptr, i64 %inc, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind {
 ;CHECK-LABEL: test_v8i16_post_reg_ld3lane:
 ;CHECK: ld3.h { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
-  %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A)
+  %ld3 = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   store i16* %tmp, i16** %ptr
   ret { <8 x i16>, <8 x i16>, <8 x i16> } %ld3
 }
 
-declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
+declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
 
 
 define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld3lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind {
 ;CHECK-LABEL: test_v4i16_post_imm_ld3lane:
 ;CHECK: ld3.h { v0, v1, v2 }[0], [x0], #6
-  %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A)
+  %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A)
   %tmp = getelementptr i16* %A, i32 3
   store i16* %tmp, i16** %ptr
   ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3
@@ -3194,19 +3194,19 @@
 define { <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld3lane(i16* %A, i16** %ptr, i64 %inc, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind {
 ;CHECK-LABEL: test_v4i16_post_reg_ld3lane:
 ;CHECK: ld3.h { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
-  %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A)
+  %ld3 = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   store i16* %tmp, i16** %ptr
   ret { <4 x i16>, <4 x i16>, <4 x i16> } %ld3
 }
 
-declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i64, i16*) nounwind readonly
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i64, i16*) nounwind readonly
 
 
 define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld3lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) nounwind {
 ;CHECK-LABEL: test_v4i32_post_imm_ld3lane:
 ;CHECK: ld3.s { v0, v1, v2 }[0], [x0], #12
-  %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A)
+  %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A)
   %tmp = getelementptr i32* %A, i32 3
   store i32* %tmp, i32** %ptr
   ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3
@@ -3215,19 +3215,19 @@
 define { <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld3lane(i32* %A, i32** %ptr, i64 %inc, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) nounwind {
 ;CHECK-LABEL: test_v4i32_post_reg_ld3lane:
 ;CHECK: ld3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
-  %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A)
+  %ld3 = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   store i32* %tmp, i32** %ptr
   ret { <4 x i32>, <4 x i32>, <4 x i32> } %ld3
 }
 
-declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
 
 
 define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld3lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind {
 ;CHECK-LABEL: test_v2i32_post_imm_ld3lane:
 ;CHECK: ld3.s { v0, v1, v2 }[0], [x0], #12
-  %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A)
+  %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A)
   %tmp = getelementptr i32* %A, i32 3
   store i32* %tmp, i32** %ptr
   ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3
@@ -3236,19 +3236,19 @@
 define { <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld3lane(i32* %A, i32** %ptr, i64 %inc, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind {
 ;CHECK-LABEL: test_v2i32_post_reg_ld3lane:
 ;CHECK: ld3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
-  %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A)
+  %ld3 = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   store i32* %tmp, i32** %ptr
   ret { <2 x i32>, <2 x i32>, <2 x i32> } %ld3
 }
 
-declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld3lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i64, i32*) nounwind readonly
+declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i64, i32*) nounwind readonly
 
 
 define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld3lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind {
 ;CHECK-LABEL: test_v2i64_post_imm_ld3lane:
 ;CHECK: ld3.d { v0, v1, v2 }[0], [x0], #24
-  %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A)
+  %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A)
   %tmp = getelementptr i64* %A, i32 3
   store i64* %tmp, i64** %ptr
   ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3
@@ -3257,19 +3257,19 @@
 define { <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld3lane(i64* %A, i64** %ptr, i64 %inc, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind {
 ;CHECK-LABEL: test_v2i64_post_reg_ld3lane:
 ;CHECK: ld3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
-  %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A)
+  %ld3 = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   store i64* %tmp, i64** %ptr
   ret { <2 x i64>, <2 x i64>, <2 x i64> } %ld3
 }
 
-declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
+declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
 
 
 define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld3lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind {
 ;CHECK-LABEL: test_v1i64_post_imm_ld3lane:
 ;CHECK: ld3.d { v0, v1, v2 }[0], [x0], #24
-  %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A)
+  %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A)
   %tmp = getelementptr i64* %A, i32 3
   store i64* %tmp, i64** %ptr
   ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3
@@ -3278,19 +3278,19 @@
 define { <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld3lane(i64* %A, i64** %ptr, i64 %inc, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind {
 ;CHECK-LABEL: test_v1i64_post_reg_ld3lane:
 ;CHECK: ld3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
-  %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A)
+  %ld3 = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   store i64* %tmp, i64** %ptr
   ret { <1 x i64>, <1 x i64>, <1 x i64> } %ld3
 }
 
-declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld3lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64, i64*) nounwind readonly
+declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64, i64*) nounwind readonly
 
 
 define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld3lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind {
 ;CHECK-LABEL: test_v4f32_post_imm_ld3lane:
 ;CHECK: ld3.s { v0, v1, v2 }[0], [x0], #12
-  %ld3 = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A)
+  %ld3 = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A)
   %tmp = getelementptr float* %A, i32 3
   store float* %tmp, float** %ptr
   ret { <4 x float>, <4 x float>, <4 x float> } %ld3
@@ -3299,19 +3299,19 @@
 define { <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld3lane(float* %A, float** %ptr, i64 %inc, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind {
 ;CHECK-LABEL: test_v4f32_post_reg_ld3lane:
 ;CHECK: ld3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
-  %ld3 = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A)
+  %ld3 = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   store float* %tmp, float** %ptr
   ret { <4 x float>, <4 x float>, <4 x float> } %ld3
 }
 
-declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, i64, float*) nounwind readonly
+declare { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, i64, float*) nounwind readonly
 
 
 define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld3lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D) nounwind {
 ;CHECK-LABEL: test_v2f32_post_imm_ld3lane:
 ;CHECK: ld3.s { v0, v1, v2 }[0], [x0], #12
-  %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A)
+  %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A)
   %tmp = getelementptr float* %A, i32 3
   store float* %tmp, float** %ptr
   ret { <2 x float>, <2 x float>, <2 x float> } %ld3
@@ -3320,19 +3320,19 @@
 define { <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld3lane(float* %A, float** %ptr, i64 %inc, <2 x float> %B, <2 x float> %C, <2 x float> %D) nounwind {
 ;CHECK-LABEL: test_v2f32_post_reg_ld3lane:
 ;CHECK: ld3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
-  %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A)
+  %ld3 = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   store float* %tmp, float** %ptr
   ret { <2 x float>, <2 x float>, <2 x float> } %ld3
 }
 
-declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld3lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, i64, float*) nounwind readonly
+declare { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, i64, float*) nounwind readonly
 
 
 define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld3lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind {
 ;CHECK-LABEL: test_v2f64_post_imm_ld3lane:
 ;CHECK: ld3.d { v0, v1, v2 }[0], [x0], #24
-  %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A)
+  %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A)
   %tmp = getelementptr double* %A, i32 3
   store double* %tmp, double** %ptr
   ret { <2 x double>, <2 x double>, <2 x double> } %ld3
@@ -3341,19 +3341,19 @@
 define { <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld3lane(double* %A, double** %ptr, i64 %inc, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind {
 ;CHECK-LABEL: test_v2f64_post_reg_ld3lane:
 ;CHECK: ld3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
-  %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A)
+  %ld3 = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   store double* %tmp, double** %ptr
   ret { <2 x double>, <2 x double>, <2 x double> } %ld3
 }
 
-declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld3lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, i64, double*) nounwind readonly
+declare { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, i64, double*) nounwind readonly
 
 
 define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld3lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind {
 ;CHECK-LABEL: test_v1f64_post_imm_ld3lane:
 ;CHECK: ld3.d { v0, v1, v2 }[0], [x0], #24
-  %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A)
+  %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A)
   %tmp = getelementptr double* %A, i32 3
   store double* %tmp, double** %ptr
   ret { <1 x double>, <1 x double>, <1 x double> } %ld3
@@ -3362,19 +3362,19 @@
 define { <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld3lane(double* %A, double** %ptr, i64 %inc, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind {
 ;CHECK-LABEL: test_v1f64_post_reg_ld3lane:
 ;CHECK: ld3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
-  %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A)
+  %ld3 = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   store double* %tmp, double** %ptr
   ret { <1 x double>, <1 x double>, <1 x double> } %ld3
 }
 
-declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld3lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, i64, double*) nounwind readonly
+declare { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, i64, double*) nounwind readonly
 
 
 define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld4lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind {
 ;CHECK-LABEL: test_v16i8_post_imm_ld4lane:
 ;CHECK: ld4.b { v0, v1, v2, v3 }[0], [x0], #4
-  %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A)
+  %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A)
   %tmp = getelementptr i8* %A, i32 4
   store i8* %tmp, i8** %ptr
   ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4
@@ -3383,19 +3383,19 @@
 define { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @test_v16i8_post_reg_ld4lane(i8* %A, i8** %ptr, i64 %inc, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind {
 ;CHECK-LABEL: test_v16i8_post_reg_ld4lane:
 ;CHECK: ld4.b { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
-  %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A)
+  %ld4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   store i8* %tmp, i8** %ptr
   ret { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld4
 }
 
-declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
 
 
 define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_imm_ld4lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind {
 ;CHECK-LABEL: test_v8i8_post_imm_ld4lane:
 ;CHECK: ld4.b { v0, v1, v2, v3 }[0], [x0], #4
-  %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A)
+  %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A)
   %tmp = getelementptr i8* %A, i32 4
   store i8* %tmp, i8** %ptr
   ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4
@@ -3404,19 +3404,19 @@
 define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @test_v8i8_post_reg_ld4lane(i8* %A, i8** %ptr, i64 %inc, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind {
 ;CHECK-LABEL: test_v8i8_post_reg_ld4lane:
 ;CHECK: ld4.b { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
-  %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A)
+  %ld4 = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   store i8* %tmp, i8** %ptr
   ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld4
 }
 
-declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld4lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i64, i8*) nounwind readonly
+declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i64, i8*) nounwind readonly
 
 
 define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_imm_ld4lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind {
 ;CHECK-LABEL: test_v8i16_post_imm_ld4lane:
 ;CHECK: ld4.h { v0, v1, v2, v3 }[0], [x0], #8
-  %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A)
+  %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A)
   %tmp = getelementptr i16* %A, i32 4
   store i16* %tmp, i16** %ptr
   ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4
@@ -3425,19 +3425,19 @@
 define { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @test_v8i16_post_reg_ld4lane(i16* %A, i16** %ptr, i64 %inc, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind {
 ;CHECK-LABEL: test_v8i16_post_reg_ld4lane:
 ;CHECK: ld4.h { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
-  %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A)
+  %ld4 = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   store i16* %tmp, i16** %ptr
   ret { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld4
 }
 
-declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm64.neon.ld4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
+declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
 
 
 define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_imm_ld4lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind {
 ;CHECK-LABEL: test_v4i16_post_imm_ld4lane:
 ;CHECK: ld4.h { v0, v1, v2, v3 }[0], [x0], #8
-  %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A)
+  %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A)
   %tmp = getelementptr i16* %A, i32 4
   store i16* %tmp, i16** %ptr
   ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4
@@ -3446,19 +3446,19 @@
 define { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @test_v4i16_post_reg_ld4lane(i16* %A, i16** %ptr, i64 %inc, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind {
 ;CHECK-LABEL: test_v4i16_post_reg_ld4lane:
 ;CHECK: ld4.h { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
-  %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A)
+  %ld4 = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   store i16* %tmp, i16** %ptr
   ret { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %ld4
 }
 
-declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i64, i16*) nounwind readonly
+declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i64, i16*) nounwind readonly
 
 
 define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_imm_ld4lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind {
 ;CHECK-LABEL: test_v4i32_post_imm_ld4lane:
 ;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], #16
-  %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A)
+  %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A)
   %tmp = getelementptr i32* %A, i32 4
   store i32* %tmp, i32** %ptr
   ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4
@@ -3467,19 +3467,19 @@
 define { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @test_v4i32_post_reg_ld4lane(i32* %A, i32** %ptr, i64 %inc, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind {
 ;CHECK-LABEL: test_v4i32_post_reg_ld4lane:
 ;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
-  %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A)
+  %ld4 = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   store i32* %tmp, i32** %ptr
   ret { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld4
 }
 
-declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
+declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
 
 
 define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_imm_ld4lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind {
 ;CHECK-LABEL: test_v2i32_post_imm_ld4lane:
 ;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], #16
-  %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A)
+  %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A)
   %tmp = getelementptr i32* %A, i32 4
   store i32* %tmp, i32** %ptr
   ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4
@@ -3488,19 +3488,19 @@
 define { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @test_v2i32_post_reg_ld4lane(i32* %A, i32** %ptr, i64 %inc, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind {
 ;CHECK-LABEL: test_v2i32_post_reg_ld4lane:
 ;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
-  %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A)
+  %ld4 = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   store i32* %tmp, i32** %ptr
   ret { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %ld4
 }
 
-declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld4lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i64, i32*) nounwind readonly
+declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i64, i32*) nounwind readonly
 
 
 define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_imm_ld4lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E) nounwind {
 ;CHECK-LABEL: test_v2i64_post_imm_ld4lane:
 ;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], #32
-  %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A)
+  %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A)
   %tmp = getelementptr i64* %A, i32 4
   store i64* %tmp, i64** %ptr
   ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4
@@ -3509,19 +3509,19 @@
 define { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @test_v2i64_post_reg_ld4lane(i64* %A, i64** %ptr, i64 %inc, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E) nounwind {
 ;CHECK-LABEL: test_v2i64_post_reg_ld4lane:
 ;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
-  %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A)
+  %ld4 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   store i64* %tmp, i64** %ptr
   ret { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld4
 }
 
-declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm64.neon.ld4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
+declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
 
 
 define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_imm_ld4lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind {
 ;CHECK-LABEL: test_v1i64_post_imm_ld4lane:
 ;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], #32
-  %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A)
+  %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A)
   %tmp = getelementptr i64* %A, i32 4
   store i64* %tmp, i64** %ptr
   ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4
@@ -3530,19 +3530,19 @@
 define { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @test_v1i64_post_reg_ld4lane(i64* %A, i64** %ptr, i64 %inc, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind {
 ;CHECK-LABEL: test_v1i64_post_reg_ld4lane:
 ;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
-  %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A)
+  %ld4 = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   store i64* %tmp, i64** %ptr
   ret { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %ld4
 }
 
-declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm64.neon.ld4lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64, i64*) nounwind readonly
+declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64, i64*) nounwind readonly
 
 
 define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_imm_ld4lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E) nounwind {
 ;CHECK-LABEL: test_v4f32_post_imm_ld4lane:
 ;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], #16
-  %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A)
+  %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A)
   %tmp = getelementptr float* %A, i32 4
   store float* %tmp, float** %ptr
   ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4
@@ -3551,19 +3551,19 @@
 define { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @test_v4f32_post_reg_ld4lane(float* %A, float** %ptr, i64 %inc, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E) nounwind {
 ;CHECK-LABEL: test_v4f32_post_reg_ld4lane:
 ;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
-  %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A)
+  %ld4 = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   store float* %tmp, float** %ptr
   ret { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld4
 }
 
-declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld4lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, i64, float*) nounwind readonly
+declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, i64, float*) nounwind readonly
 
 
 define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_imm_ld4lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind {
 ;CHECK-LABEL: test_v2f32_post_imm_ld4lane:
 ;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], #16
-  %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A)
+  %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A)
   %tmp = getelementptr float* %A, i32 4
   store float* %tmp, float** %ptr
   ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4
@@ -3572,19 +3572,19 @@
 define { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @test_v2f32_post_reg_ld4lane(float* %A, float** %ptr, i64 %inc, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind {
 ;CHECK-LABEL: test_v2f32_post_reg_ld4lane:
 ;CHECK: ld4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
-  %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A)
+  %ld4 = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   store float* %tmp, float** %ptr
   ret { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %ld4
 }
 
-declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm64.neon.ld4lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, i64, float*) nounwind readonly
+declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, i64, float*) nounwind readonly
 
 
 define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_imm_ld4lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E) nounwind {
 ;CHECK-LABEL: test_v2f64_post_imm_ld4lane:
 ;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], #32
-  %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A)
+  %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A)
   %tmp = getelementptr double* %A, i32 4
   store double* %tmp, double** %ptr
   ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4
@@ -3593,19 +3593,19 @@
 define { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @test_v2f64_post_reg_ld4lane(double* %A, double** %ptr, i64 %inc, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E) nounwind {
 ;CHECK-LABEL: test_v2f64_post_reg_ld4lane:
 ;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
-  %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A)
+  %ld4 = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   store double* %tmp, double** %ptr
   ret { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld4
 }
 
-declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm64.neon.ld4lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, i64, double*) nounwind readonly
+declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, i64, double*) nounwind readonly
 
 
 define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_imm_ld4lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind {
 ;CHECK-LABEL: test_v1f64_post_imm_ld4lane:
 ;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], #32
-  %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A)
+  %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A)
   %tmp = getelementptr double* %A, i32 4
   store double* %tmp, double** %ptr
   ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4
@@ -3614,19 +3614,19 @@
 define { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @test_v1f64_post_reg_ld4lane(double* %A, double** %ptr, i64 %inc, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind {
 ;CHECK-LABEL: test_v1f64_post_reg_ld4lane:
 ;CHECK: ld4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
-  %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A)
+  %ld4 = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   store double* %tmp, double** %ptr
   ret { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %ld4
 }
 
-declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm64.neon.ld4lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, i64, double*) nounwind readonly
+declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, i64, double*) nounwind readonly
 
 
 define i8* @test_v16i8_post_imm_st2(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) nounwind {
 ;CHECK-LABEL: test_v16i8_post_imm_st2:
 ;CHECK: st2.16b { v0, v1 }, [x0], #32
-  call void @llvm.arm64.neon.st2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A)
+  call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A)
   %tmp = getelementptr i8* %A, i32 32
   ret i8* %tmp
 }
@@ -3634,18 +3634,18 @@
 define i8* @test_v16i8_post_reg_st2(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v16i8_post_reg_st2:
 ;CHECK: st2.16b { v0, v1 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A)
+  call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   ret i8* %tmp
 }
 
-declare void @llvm.arm64.neon.st2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*)
+declare void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*)
 
 
 define i8* @test_v8i8_post_imm_st2(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C) nounwind {
 ;CHECK-LABEL: test_v8i8_post_imm_st2:
 ;CHECK: st2.8b { v0, v1 }, [x0], #16
-  call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A)
+  call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A)
   %tmp = getelementptr i8* %A, i32 16
   ret i8* %tmp
 }
@@ -3653,18 +3653,18 @@
 define i8* @test_v8i8_post_reg_st2(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v8i8_post_reg_st2:
 ;CHECK: st2.8b { v0, v1 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A)
+  call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   ret i8* %tmp
 }
 
-declare void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*)
+declare void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*)
 
 
 define i16* @test_v8i16_post_imm_st2(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C) nounwind {
 ;CHECK-LABEL: test_v8i16_post_imm_st2:
 ;CHECK: st2.8h { v0, v1 }, [x0], #32
-  call void @llvm.arm64.neon.st2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A)
+  call void @llvm.aarch64.neon.st2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A)
   %tmp = getelementptr i16* %A, i32 16
   ret i16* %tmp
 }
@@ -3672,18 +3672,18 @@
 define i16* @test_v8i16_post_reg_st2(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v8i16_post_reg_st2:
 ;CHECK: st2.8h { v0, v1 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A)
+  call void @llvm.aarch64.neon.st2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   ret i16* %tmp
 }
 
-declare void @llvm.arm64.neon.st2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*)
+declare void @llvm.aarch64.neon.st2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*)
 
 
 define i16* @test_v4i16_post_imm_st2(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C) nounwind {
 ;CHECK-LABEL: test_v4i16_post_imm_st2:
 ;CHECK: st2.4h { v0, v1 }, [x0], #16
-  call void @llvm.arm64.neon.st2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A)
+  call void @llvm.aarch64.neon.st2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A)
   %tmp = getelementptr i16* %A, i32 8
   ret i16* %tmp
 }
@@ -3691,18 +3691,18 @@
 define i16* @test_v4i16_post_reg_st2(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4i16_post_reg_st2:
 ;CHECK: st2.4h { v0, v1 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A)
+  call void @llvm.aarch64.neon.st2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   ret i16* %tmp
 }
 
-declare void @llvm.arm64.neon.st2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*)
+declare void @llvm.aarch64.neon.st2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*)
 
 
 define i32* @test_v4i32_post_imm_st2(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C) nounwind {
 ;CHECK-LABEL: test_v4i32_post_imm_st2:
 ;CHECK: st2.4s { v0, v1 }, [x0], #32
-  call void @llvm.arm64.neon.st2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A)
+  call void @llvm.aarch64.neon.st2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A)
   %tmp = getelementptr i32* %A, i32 8
   ret i32* %tmp
 }
@@ -3710,18 +3710,18 @@
 define i32* @test_v4i32_post_reg_st2(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4i32_post_reg_st2:
 ;CHECK: st2.4s { v0, v1 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A)
+  call void @llvm.aarch64.neon.st2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   ret i32* %tmp
 }
 
-declare void @llvm.arm64.neon.st2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*)
+declare void @llvm.aarch64.neon.st2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*)
 
 
 define i32* @test_v2i32_post_imm_st2(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C) nounwind {
 ;CHECK-LABEL: test_v2i32_post_imm_st2:
 ;CHECK: st2.2s { v0, v1 }, [x0], #16
-  call void @llvm.arm64.neon.st2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A)
+  call void @llvm.aarch64.neon.st2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A)
   %tmp = getelementptr i32* %A, i32 4
   ret i32* %tmp
 }
@@ -3729,18 +3729,18 @@
 define i32* @test_v2i32_post_reg_st2(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2i32_post_reg_st2:
 ;CHECK: st2.2s { v0, v1 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A)
+  call void @llvm.aarch64.neon.st2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   ret i32* %tmp
 }
 
-declare void @llvm.arm64.neon.st2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*)
+declare void @llvm.aarch64.neon.st2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*)
 
 
 define i64* @test_v2i64_post_imm_st2(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C) nounwind {
 ;CHECK-LABEL: test_v2i64_post_imm_st2:
 ;CHECK: st2.2d { v0, v1 }, [x0], #32
-  call void @llvm.arm64.neon.st2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A)
+  call void @llvm.aarch64.neon.st2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A)
   %tmp = getelementptr i64* %A, i64 4
   ret i64* %tmp
 }
@@ -3748,18 +3748,18 @@
 define i64* @test_v2i64_post_reg_st2(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2i64_post_reg_st2:
 ;CHECK: st2.2d { v0, v1 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A)
+  call void @llvm.aarch64.neon.st2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   ret i64* %tmp
 }
 
-declare void @llvm.arm64.neon.st2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*)
+declare void @llvm.aarch64.neon.st2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*)
 
 
 define i64* @test_v1i64_post_imm_st2(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C) nounwind {
 ;CHECK-LABEL: test_v1i64_post_imm_st2:
 ;CHECK: st1.1d { v0, v1 }, [x0], #16
-  call void @llvm.arm64.neon.st2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A)
+  call void @llvm.aarch64.neon.st2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A)
   %tmp = getelementptr i64* %A, i64 2
   ret i64* %tmp
 }
@@ -3767,18 +3767,18 @@
 define i64* @test_v1i64_post_reg_st2(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v1i64_post_reg_st2:
 ;CHECK: st1.1d { v0, v1 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A)
+  call void @llvm.aarch64.neon.st2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   ret i64* %tmp
 }
 
-declare void @llvm.arm64.neon.st2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*)
+declare void @llvm.aarch64.neon.st2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*)
 
 
 define float* @test_v4f32_post_imm_st2(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C) nounwind {
 ;CHECK-LABEL: test_v4f32_post_imm_st2:
 ;CHECK: st2.4s { v0, v1 }, [x0], #32
-  call void @llvm.arm64.neon.st2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A)
+  call void @llvm.aarch64.neon.st2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A)
   %tmp = getelementptr float* %A, i32 8
   ret float* %tmp
 }
@@ -3786,18 +3786,18 @@
 define float* @test_v4f32_post_reg_st2(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4f32_post_reg_st2:
 ;CHECK: st2.4s { v0, v1 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A)
+  call void @llvm.aarch64.neon.st2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   ret float* %tmp
 }
 
-declare void @llvm.arm64.neon.st2.v4f32.p0f32(<4 x float>, <4 x float>, float*)
+declare void @llvm.aarch64.neon.st2.v4f32.p0f32(<4 x float>, <4 x float>, float*)
 
 
 define float* @test_v2f32_post_imm_st2(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C) nounwind {
 ;CHECK-LABEL: test_v2f32_post_imm_st2:
 ;CHECK: st2.2s { v0, v1 }, [x0], #16
-  call void @llvm.arm64.neon.st2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A)
+  call void @llvm.aarch64.neon.st2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A)
   %tmp = getelementptr float* %A, i32 4
   ret float* %tmp
 }
@@ -3805,18 +3805,18 @@
 define float* @test_v2f32_post_reg_st2(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2f32_post_reg_st2:
 ;CHECK: st2.2s { v0, v1 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A)
+  call void @llvm.aarch64.neon.st2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   ret float* %tmp
 }
 
-declare void @llvm.arm64.neon.st2.v2f32.p0f32(<2 x float>, <2 x float>, float*)
+declare void @llvm.aarch64.neon.st2.v2f32.p0f32(<2 x float>, <2 x float>, float*)
 
 
 define double* @test_v2f64_post_imm_st2(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C) nounwind {
 ;CHECK-LABEL: test_v2f64_post_imm_st2:
 ;CHECK: st2.2d { v0, v1 }, [x0], #32
-  call void @llvm.arm64.neon.st2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A)
+  call void @llvm.aarch64.neon.st2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A)
   %tmp = getelementptr double* %A, i64 4
   ret double* %tmp
 }
@@ -3824,18 +3824,18 @@
 define double* @test_v2f64_post_reg_st2(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2f64_post_reg_st2:
 ;CHECK: st2.2d { v0, v1 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A)
+  call void @llvm.aarch64.neon.st2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   ret double* %tmp
 }
 
-declare void @llvm.arm64.neon.st2.v2f64.p0f64(<2 x double>, <2 x double>, double*)
+declare void @llvm.aarch64.neon.st2.v2f64.p0f64(<2 x double>, <2 x double>, double*)
 
 
 define double* @test_v1f64_post_imm_st2(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C) nounwind {
 ;CHECK-LABEL: test_v1f64_post_imm_st2:
 ;CHECK: st1.1d { v0, v1 }, [x0], #16
-  call void @llvm.arm64.neon.st2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A)
+  call void @llvm.aarch64.neon.st2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A)
   %tmp = getelementptr double* %A, i64 2
   ret double* %tmp
 }
@@ -3843,18 +3843,18 @@
 define double* @test_v1f64_post_reg_st2(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v1f64_post_reg_st2:
 ;CHECK: st1.1d { v0, v1 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A)
+  call void @llvm.aarch64.neon.st2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   ret double* %tmp
 }
 
-declare void @llvm.arm64.neon.st2.v1f64.p0f64(<1 x double>, <1 x double>, double*)
+declare void @llvm.aarch64.neon.st2.v1f64.p0f64(<1 x double>, <1 x double>, double*)
 
 
 define i8* @test_v16i8_post_imm_st3(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind {
 ;CHECK-LABEL: test_v16i8_post_imm_st3:
 ;CHECK: st3.16b { v0, v1, v2 }, [x0], #48
-  call void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A)
+  call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A)
   %tmp = getelementptr i8* %A, i32 48
   ret i8* %tmp
 }
@@ -3862,18 +3862,18 @@
 define i8* @test_v16i8_post_reg_st3(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v16i8_post_reg_st3:
 ;CHECK: st3.16b { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A)
+  call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   ret i8* %tmp
 }
 
-declare void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*)
+declare void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*)
 
 
 define i8* @test_v8i8_post_imm_st3(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind {
 ;CHECK-LABEL: test_v8i8_post_imm_st3:
 ;CHECK: st3.8b { v0, v1, v2 }, [x0], #24
-  call void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A)
+  call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A)
   %tmp = getelementptr i8* %A, i32 24
   ret i8* %tmp
 }
@@ -3881,18 +3881,18 @@
 define i8* @test_v8i8_post_reg_st3(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v8i8_post_reg_st3:
 ;CHECK: st3.8b { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A)
+  call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   ret i8* %tmp
 }
 
-declare void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*)
+declare void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*)
 
 
 define i16* @test_v8i16_post_imm_st3(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind {
 ;CHECK-LABEL: test_v8i16_post_imm_st3:
 ;CHECK: st3.8h { v0, v1, v2 }, [x0], #48
-  call void @llvm.arm64.neon.st3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A)
+  call void @llvm.aarch64.neon.st3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A)
   %tmp = getelementptr i16* %A, i32 24
   ret i16* %tmp
 }
@@ -3900,18 +3900,18 @@
 define i16* @test_v8i16_post_reg_st3(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v8i16_post_reg_st3:
 ;CHECK: st3.8h { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A)
+  call void @llvm.aarch64.neon.st3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   ret i16* %tmp
 }
 
-declare void @llvm.arm64.neon.st3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*)
+declare void @llvm.aarch64.neon.st3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*)
 
 
 define i16* @test_v4i16_post_imm_st3(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind {
 ;CHECK-LABEL: test_v4i16_post_imm_st3:
 ;CHECK: st3.4h { v0, v1, v2 }, [x0], #24
-  call void @llvm.arm64.neon.st3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A)
+  call void @llvm.aarch64.neon.st3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A)
   %tmp = getelementptr i16* %A, i32 12
   ret i16* %tmp
 }
@@ -3919,18 +3919,18 @@
 define i16* @test_v4i16_post_reg_st3(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4i16_post_reg_st3:
 ;CHECK: st3.4h { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A)
+  call void @llvm.aarch64.neon.st3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   ret i16* %tmp
 }
 
-declare void @llvm.arm64.neon.st3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*)
+declare void @llvm.aarch64.neon.st3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*)
 
 
 define i32* @test_v4i32_post_imm_st3(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) nounwind {
 ;CHECK-LABEL: test_v4i32_post_imm_st3:
 ;CHECK: st3.4s { v0, v1, v2 }, [x0], #48
-  call void @llvm.arm64.neon.st3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A)
+  call void @llvm.aarch64.neon.st3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A)
   %tmp = getelementptr i32* %A, i32 12
   ret i32* %tmp
 }
@@ -3938,18 +3938,18 @@
 define i32* @test_v4i32_post_reg_st3(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4i32_post_reg_st3:
 ;CHECK: st3.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A)
+  call void @llvm.aarch64.neon.st3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   ret i32* %tmp
 }
 
-declare void @llvm.arm64.neon.st3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*)
+declare void @llvm.aarch64.neon.st3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*)
 
 
 define i32* @test_v2i32_post_imm_st3(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind {
 ;CHECK-LABEL: test_v2i32_post_imm_st3:
 ;CHECK: st3.2s { v0, v1, v2 }, [x0], #24
-  call void @llvm.arm64.neon.st3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A)
+  call void @llvm.aarch64.neon.st3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A)
   %tmp = getelementptr i32* %A, i32 6
   ret i32* %tmp
 }
@@ -3957,18 +3957,18 @@
 define i32* @test_v2i32_post_reg_st3(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2i32_post_reg_st3:
 ;CHECK: st3.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A)
+  call void @llvm.aarch64.neon.st3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   ret i32* %tmp
 }
 
-declare void @llvm.arm64.neon.st3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*)
+declare void @llvm.aarch64.neon.st3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*)
 
 
 define i64* @test_v2i64_post_imm_st3(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind {
 ;CHECK-LABEL: test_v2i64_post_imm_st3:
 ;CHECK: st3.2d { v0, v1, v2 }, [x0], #48
-  call void @llvm.arm64.neon.st3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A)
+  call void @llvm.aarch64.neon.st3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A)
   %tmp = getelementptr i64* %A, i64 6
   ret i64* %tmp
 }
@@ -3976,18 +3976,18 @@
 define i64* @test_v2i64_post_reg_st3(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2i64_post_reg_st3:
 ;CHECK: st3.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A)
+  call void @llvm.aarch64.neon.st3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   ret i64* %tmp
 }
 
-declare void @llvm.arm64.neon.st3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*)
+declare void @llvm.aarch64.neon.st3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*)
 
 
 define i64* @test_v1i64_post_imm_st3(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind {
 ;CHECK-LABEL: test_v1i64_post_imm_st3:
 ;CHECK: st1.1d { v0, v1, v2 }, [x0], #24
-  call void @llvm.arm64.neon.st3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A)
+  call void @llvm.aarch64.neon.st3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A)
   %tmp = getelementptr i64* %A, i64 3
   ret i64* %tmp
 }
@@ -3995,18 +3995,18 @@
 define i64* @test_v1i64_post_reg_st3(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v1i64_post_reg_st3:
 ;CHECK: st1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A)
+  call void @llvm.aarch64.neon.st3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   ret i64* %tmp
 }
 
-declare void @llvm.arm64.neon.st3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*)
+declare void @llvm.aarch64.neon.st3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*)
 
 
 define float* @test_v4f32_post_imm_st3(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind {
 ;CHECK-LABEL: test_v4f32_post_imm_st3:
 ;CHECK: st3.4s { v0, v1, v2 }, [x0], #48
-  call void @llvm.arm64.neon.st3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A)
+  call void @llvm.aarch64.neon.st3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A)
   %tmp = getelementptr float* %A, i32 12
   ret float* %tmp
 }
@@ -4014,18 +4014,18 @@
 define float* @test_v4f32_post_reg_st3(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4f32_post_reg_st3:
 ;CHECK: st3.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A)
+  call void @llvm.aarch64.neon.st3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   ret float* %tmp
 }
 
-declare void @llvm.arm64.neon.st3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*)
+declare void @llvm.aarch64.neon.st3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*)
 
 
 define float* @test_v2f32_post_imm_st3(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D) nounwind {
 ;CHECK-LABEL: test_v2f32_post_imm_st3:
 ;CHECK: st3.2s { v0, v1, v2 }, [x0], #24
-  call void @llvm.arm64.neon.st3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A)
+  call void @llvm.aarch64.neon.st3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A)
   %tmp = getelementptr float* %A, i32 6
   ret float* %tmp
 }
@@ -4033,18 +4033,18 @@
 define float* @test_v2f32_post_reg_st3(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2f32_post_reg_st3:
 ;CHECK: st3.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A)
+  call void @llvm.aarch64.neon.st3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   ret float* %tmp
 }
 
-declare void @llvm.arm64.neon.st3.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, float*)
+declare void @llvm.aarch64.neon.st3.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, float*)
 
 
 define double* @test_v2f64_post_imm_st3(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind {
 ;CHECK-LABEL: test_v2f64_post_imm_st3:
 ;CHECK: st3.2d { v0, v1, v2 }, [x0], #48
-  call void @llvm.arm64.neon.st3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A)
+  call void @llvm.aarch64.neon.st3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A)
   %tmp = getelementptr double* %A, i64 6
   ret double* %tmp
 }
@@ -4052,18 +4052,18 @@
 define double* @test_v2f64_post_reg_st3(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2f64_post_reg_st3:
 ;CHECK: st3.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A)
+  call void @llvm.aarch64.neon.st3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   ret double* %tmp
 }
 
-declare void @llvm.arm64.neon.st3.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, double*)
+declare void @llvm.aarch64.neon.st3.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, double*)
 
 
 define double* @test_v1f64_post_imm_st3(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind {
 ;CHECK-LABEL: test_v1f64_post_imm_st3:
 ;CHECK: st1.1d { v0, v1, v2 }, [x0], #24
-  call void @llvm.arm64.neon.st3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A)
+  call void @llvm.aarch64.neon.st3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A)
   %tmp = getelementptr double* %A, i64 3
   ret double* %tmp
 }
@@ -4071,18 +4071,18 @@
 define double* @test_v1f64_post_reg_st3(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v1f64_post_reg_st3:
 ;CHECK: st1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A)
+  call void @llvm.aarch64.neon.st3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   ret double* %tmp
 }
 
-declare void @llvm.arm64.neon.st3.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, double*)
+declare void @llvm.aarch64.neon.st3.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, double*)
 
 
 define i8* @test_v16i8_post_imm_st4(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind {
 ;CHECK-LABEL: test_v16i8_post_imm_st4:
 ;CHECK: st4.16b { v0, v1, v2, v3 }, [x0], #64
-  call void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A)
+  call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A)
   %tmp = getelementptr i8* %A, i32 64
   ret i8* %tmp
 }
@@ -4090,18 +4090,18 @@
 define i8* @test_v16i8_post_reg_st4(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v16i8_post_reg_st4:
 ;CHECK: st4.16b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A)
+  call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   ret i8* %tmp
 }
 
-declare void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*)
+declare void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*)
 
 
 define i8* @test_v8i8_post_imm_st4(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind {
 ;CHECK-LABEL: test_v8i8_post_imm_st4:
 ;CHECK: st4.8b { v0, v1, v2, v3 }, [x0], #32
-  call void @llvm.arm64.neon.st4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A)
+  call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A)
   %tmp = getelementptr i8* %A, i32 32
   ret i8* %tmp
 }
@@ -4109,18 +4109,18 @@
 define i8* @test_v8i8_post_reg_st4(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v8i8_post_reg_st4:
 ;CHECK: st4.8b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A)
+  call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   ret i8* %tmp
 }
 
-declare void @llvm.arm64.neon.st4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*)
+declare void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*)
 
 
 define i16* @test_v8i16_post_imm_st4(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind {
 ;CHECK-LABEL: test_v8i16_post_imm_st4:
 ;CHECK: st4.8h { v0, v1, v2, v3 }, [x0], #64
-  call void @llvm.arm64.neon.st4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A)
+  call void @llvm.aarch64.neon.st4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A)
   %tmp = getelementptr i16* %A, i32 32
   ret i16* %tmp
 }
@@ -4128,18 +4128,18 @@
 define i16* @test_v8i16_post_reg_st4(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v8i16_post_reg_st4:
 ;CHECK: st4.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A)
+  call void @llvm.aarch64.neon.st4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   ret i16* %tmp
 }
 
-declare void @llvm.arm64.neon.st4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*)
+declare void @llvm.aarch64.neon.st4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*)
 
 
 define i16* @test_v4i16_post_imm_st4(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind {
 ;CHECK-LABEL: test_v4i16_post_imm_st4:
 ;CHECK: st4.4h { v0, v1, v2, v3 }, [x0], #32
-  call void @llvm.arm64.neon.st4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A)
+  call void @llvm.aarch64.neon.st4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A)
   %tmp = getelementptr i16* %A, i32 16
   ret i16* %tmp
 }
@@ -4147,18 +4147,18 @@
 define i16* @test_v4i16_post_reg_st4(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4i16_post_reg_st4:
 ;CHECK: st4.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A)
+  call void @llvm.aarch64.neon.st4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   ret i16* %tmp
 }
 
-declare void @llvm.arm64.neon.st4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>,<4 x i16>,  i16*)
+declare void @llvm.aarch64.neon.st4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i16*)
 
 
 define i32* @test_v4i32_post_imm_st4(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind {
 ;CHECK-LABEL: test_v4i32_post_imm_st4:
 ;CHECK: st4.4s { v0, v1, v2, v3 }, [x0], #64
-  call void @llvm.arm64.neon.st4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A)
+  call void @llvm.aarch64.neon.st4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A)
   %tmp = getelementptr i32* %A, i32 16
   ret i32* %tmp
 }
@@ -4166,18 +4166,18 @@
 define i32* @test_v4i32_post_reg_st4(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4i32_post_reg_st4:
 ;CHECK: st4.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A)
+  call void @llvm.aarch64.neon.st4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   ret i32* %tmp
 }
 
-declare void @llvm.arm64.neon.st4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>,<4 x i32>,  i32*)
+declare void @llvm.aarch64.neon.st4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32*)
 
 
 define i32* @test_v2i32_post_imm_st4(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind {
 ;CHECK-LABEL: test_v2i32_post_imm_st4:
 ;CHECK: st4.2s { v0, v1, v2, v3 }, [x0], #32
-  call void @llvm.arm64.neon.st4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A)
+  call void @llvm.aarch64.neon.st4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A)
   %tmp = getelementptr i32* %A, i32 8
   ret i32* %tmp
 }
@@ -4185,18 +4185,18 @@
 define i32* @test_v2i32_post_reg_st4(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2i32_post_reg_st4:
 ;CHECK: st4.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A)
+  call void @llvm.aarch64.neon.st4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   ret i32* %tmp
 }
 
-declare void @llvm.arm64.neon.st4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*)
+declare void @llvm.aarch64.neon.st4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*)
 
 
 define i64* @test_v2i64_post_imm_st4(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E) nounwind {
 ;CHECK-LABEL: test_v2i64_post_imm_st4:
 ;CHECK: st4.2d { v0, v1, v2, v3 }, [x0], #64
-  call void @llvm.arm64.neon.st4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A)
+  call void @llvm.aarch64.neon.st4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A)
   %tmp = getelementptr i64* %A, i64 8
   ret i64* %tmp
 }
@@ -4204,18 +4204,18 @@
 define i64* @test_v2i64_post_reg_st4(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2i64_post_reg_st4:
 ;CHECK: st4.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A)
+  call void @llvm.aarch64.neon.st4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   ret i64* %tmp
 }
 
-declare void @llvm.arm64.neon.st4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>,<2 x i64>,  i64*)
+declare void @llvm.aarch64.neon.st4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64*)
 
 
 define i64* @test_v1i64_post_imm_st4(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind {
 ;CHECK-LABEL: test_v1i64_post_imm_st4:
 ;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], #32
-  call void @llvm.arm64.neon.st4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A)
+  call void @llvm.aarch64.neon.st4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A)
   %tmp = getelementptr i64* %A, i64 4
   ret i64* %tmp
 }
@@ -4223,18 +4223,18 @@
 define i64* @test_v1i64_post_reg_st4(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v1i64_post_reg_st4:
 ;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A)
+  call void @llvm.aarch64.neon.st4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   ret i64* %tmp
 }
 
-declare void @llvm.arm64.neon.st4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>,<1 x i64>,  i64*)
+declare void @llvm.aarch64.neon.st4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64*)
 
 
 define float* @test_v4f32_post_imm_st4(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E) nounwind {
 ;CHECK-LABEL: test_v4f32_post_imm_st4:
 ;CHECK: st4.4s { v0, v1, v2, v3 }, [x0], #64
-  call void @llvm.arm64.neon.st4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A)
+  call void @llvm.aarch64.neon.st4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A)
   %tmp = getelementptr float* %A, i32 16
   ret float* %tmp
 }
@@ -4242,18 +4242,18 @@
 define float* @test_v4f32_post_reg_st4(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4f32_post_reg_st4:
 ;CHECK: st4.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A)
+  call void @llvm.aarch64.neon.st4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   ret float* %tmp
 }
 
-declare void @llvm.arm64.neon.st4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*)
+declare void @llvm.aarch64.neon.st4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*)
 
 
 define float* @test_v2f32_post_imm_st4(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind {
 ;CHECK-LABEL: test_v2f32_post_imm_st4:
 ;CHECK: st4.2s { v0, v1, v2, v3 }, [x0], #32
-  call void @llvm.arm64.neon.st4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A)
+  call void @llvm.aarch64.neon.st4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A)
   %tmp = getelementptr float* %A, i32 8
   ret float* %tmp
 }
@@ -4261,18 +4261,18 @@
 define float* @test_v2f32_post_reg_st4(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2f32_post_reg_st4:
 ;CHECK: st4.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A)
+  call void @llvm.aarch64.neon.st4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   ret float* %tmp
 }
 
-declare void @llvm.arm64.neon.st4.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, float*)
+declare void @llvm.aarch64.neon.st4.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, float*)
 
 
 define double* @test_v2f64_post_imm_st4(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E) nounwind {
 ;CHECK-LABEL: test_v2f64_post_imm_st4:
 ;CHECK: st4.2d { v0, v1, v2, v3 }, [x0], #64
-  call void @llvm.arm64.neon.st4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A)
+  call void @llvm.aarch64.neon.st4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A)
   %tmp = getelementptr double* %A, i64 8
   ret double* %tmp
 }
@@ -4280,18 +4280,18 @@
 define double* @test_v2f64_post_reg_st4(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2f64_post_reg_st4:
 ;CHECK: st4.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A)
+  call void @llvm.aarch64.neon.st4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   ret double* %tmp
 }
 
-declare void @llvm.arm64.neon.st4.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>,<2 x double>,  double*)
+declare void @llvm.aarch64.neon.st4.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, double*)
 
 
 define double* @test_v1f64_post_imm_st4(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind {
 ;CHECK-LABEL: test_v1f64_post_imm_st4:
 ;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], #32
-  call void @llvm.arm64.neon.st4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A)
+  call void @llvm.aarch64.neon.st4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A)
   %tmp = getelementptr double* %A, i64 4
   ret double* %tmp
 }
@@ -4299,18 +4299,18 @@
 define double* @test_v1f64_post_reg_st4(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v1f64_post_reg_st4:
 ;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A)
+  call void @llvm.aarch64.neon.st4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   ret double* %tmp
 }
 
-declare void @llvm.arm64.neon.st4.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, double*)
+declare void @llvm.aarch64.neon.st4.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, double*)
 
 
 define i8* @test_v16i8_post_imm_st1x2(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) nounwind {
 ;CHECK-LABEL: test_v16i8_post_imm_st1x2:
 ;CHECK: st1.16b { v0, v1 }, [x0], #32
-  call void @llvm.arm64.neon.st1x2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A)
+  call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A)
   %tmp = getelementptr i8* %A, i32 32
   ret i8* %tmp
 }
@@ -4318,18 +4318,18 @@
 define i8* @test_v16i8_post_reg_st1x2(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v16i8_post_reg_st1x2:
 ;CHECK: st1.16b { v0, v1 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A)
+  call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   ret i8* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*)
+declare void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*)
 
 
 define i8* @test_v8i8_post_imm_st1x2(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C) nounwind {
 ;CHECK-LABEL: test_v8i8_post_imm_st1x2:
 ;CHECK: st1.8b { v0, v1 }, [x0], #16
-  call void @llvm.arm64.neon.st1x2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A)
+  call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A)
   %tmp = getelementptr i8* %A, i32 16
   ret i8* %tmp
 }
@@ -4337,18 +4337,18 @@
 define i8* @test_v8i8_post_reg_st1x2(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v8i8_post_reg_st1x2:
 ;CHECK: st1.8b { v0, v1 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A)
+  call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   ret i8* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*)
+declare void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*)
 
 
 define i16* @test_v8i16_post_imm_st1x2(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C) nounwind {
 ;CHECK-LABEL: test_v8i16_post_imm_st1x2:
 ;CHECK: st1.8h { v0, v1 }, [x0], #32
-  call void @llvm.arm64.neon.st1x2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A)
+  call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A)
   %tmp = getelementptr i16* %A, i32 16
   ret i16* %tmp
 }
@@ -4356,18 +4356,18 @@
 define i16* @test_v8i16_post_reg_st1x2(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v8i16_post_reg_st1x2:
 ;CHECK: st1.8h { v0, v1 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A)
+  call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   ret i16* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*)
+declare void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*)
 
 
 define i16* @test_v4i16_post_imm_st1x2(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C) nounwind {
 ;CHECK-LABEL: test_v4i16_post_imm_st1x2:
 ;CHECK: st1.4h { v0, v1 }, [x0], #16
-  call void @llvm.arm64.neon.st1x2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A)
+  call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A)
   %tmp = getelementptr i16* %A, i32 8
   ret i16* %tmp
 }
@@ -4375,18 +4375,18 @@
 define i16* @test_v4i16_post_reg_st1x2(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4i16_post_reg_st1x2:
 ;CHECK: st1.4h { v0, v1 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A)
+  call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   ret i16* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*)
+declare void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*)
 
 
 define i32* @test_v4i32_post_imm_st1x2(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C) nounwind {
 ;CHECK-LABEL: test_v4i32_post_imm_st1x2:
 ;CHECK: st1.4s { v0, v1 }, [x0], #32
-  call void @llvm.arm64.neon.st1x2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A)
+  call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A)
   %tmp = getelementptr i32* %A, i32 8
   ret i32* %tmp
 }
@@ -4394,18 +4394,18 @@
 define i32* @test_v4i32_post_reg_st1x2(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4i32_post_reg_st1x2:
 ;CHECK: st1.4s { v0, v1 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A)
+  call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   ret i32* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*)
+declare void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*)
 
 
 define i32* @test_v2i32_post_imm_st1x2(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C) nounwind {
 ;CHECK-LABEL: test_v2i32_post_imm_st1x2:
 ;CHECK: st1.2s { v0, v1 }, [x0], #16
-  call void @llvm.arm64.neon.st1x2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A)
+  call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A)
   %tmp = getelementptr i32* %A, i32 4
   ret i32* %tmp
 }
@@ -4413,18 +4413,18 @@
 define i32* @test_v2i32_post_reg_st1x2(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2i32_post_reg_st1x2:
 ;CHECK: st1.2s { v0, v1 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A)
+  call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   ret i32* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*)
+declare void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*)
 
 
 define i64* @test_v2i64_post_imm_st1x2(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C) nounwind {
 ;CHECK-LABEL: test_v2i64_post_imm_st1x2:
 ;CHECK: st1.2d { v0, v1 }, [x0], #32
-  call void @llvm.arm64.neon.st1x2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A)
+  call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A)
   %tmp = getelementptr i64* %A, i64 4
   ret i64* %tmp
 }
@@ -4432,18 +4432,18 @@
 define i64* @test_v2i64_post_reg_st1x2(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2i64_post_reg_st1x2:
 ;CHECK: st1.2d { v0, v1 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A)
+  call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   ret i64* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*)
+declare void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*)
 
 
 define i64* @test_v1i64_post_imm_st1x2(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C) nounwind {
 ;CHECK-LABEL: test_v1i64_post_imm_st1x2:
 ;CHECK: st1.1d { v0, v1 }, [x0], #16
-  call void @llvm.arm64.neon.st1x2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A)
+  call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A)
   %tmp = getelementptr i64* %A, i64 2
   ret i64* %tmp
 }
@@ -4451,18 +4451,18 @@
 define i64* @test_v1i64_post_reg_st1x2(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v1i64_post_reg_st1x2:
 ;CHECK: st1.1d { v0, v1 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A)
+  call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   ret i64* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*)
+declare void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*)
 
 
 define float* @test_v4f32_post_imm_st1x2(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C) nounwind {
 ;CHECK-LABEL: test_v4f32_post_imm_st1x2:
 ;CHECK: st1.4s { v0, v1 }, [x0], #32
-  call void @llvm.arm64.neon.st1x2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A)
+  call void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A)
   %tmp = getelementptr float* %A, i32 8
   ret float* %tmp
 }
@@ -4470,18 +4470,18 @@
 define float* @test_v4f32_post_reg_st1x2(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4f32_post_reg_st1x2:
 ;CHECK: st1.4s { v0, v1 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A)
+  call void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float> %B, <4 x float> %C, float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   ret float* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x2.v4f32.p0f32(<4 x float>, <4 x float>, float*)
+declare void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float>, <4 x float>, float*)
 
 
 define float* @test_v2f32_post_imm_st1x2(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C) nounwind {
 ;CHECK-LABEL: test_v2f32_post_imm_st1x2:
 ;CHECK: st1.2s { v0, v1 }, [x0], #16
-  call void @llvm.arm64.neon.st1x2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A)
+  call void @llvm.aarch64.neon.st1x2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A)
   %tmp = getelementptr float* %A, i32 4
   ret float* %tmp
 }
@@ -4489,18 +4489,18 @@
 define float* @test_v2f32_post_reg_st1x2(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2f32_post_reg_st1x2:
 ;CHECK: st1.2s { v0, v1 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A)
+  call void @llvm.aarch64.neon.st1x2.v2f32.p0f32(<2 x float> %B, <2 x float> %C, float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   ret float* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x2.v2f32.p0f32(<2 x float>, <2 x float>, float*)
+declare void @llvm.aarch64.neon.st1x2.v2f32.p0f32(<2 x float>, <2 x float>, float*)
 
 
 define double* @test_v2f64_post_imm_st1x2(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C) nounwind {
 ;CHECK-LABEL: test_v2f64_post_imm_st1x2:
 ;CHECK: st1.2d { v0, v1 }, [x0], #32
-  call void @llvm.arm64.neon.st1x2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A)
+  call void @llvm.aarch64.neon.st1x2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A)
   %tmp = getelementptr double* %A, i64 4
   ret double* %tmp
 }
@@ -4508,18 +4508,18 @@
 define double* @test_v2f64_post_reg_st1x2(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2f64_post_reg_st1x2:
 ;CHECK: st1.2d { v0, v1 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A)
+  call void @llvm.aarch64.neon.st1x2.v2f64.p0f64(<2 x double> %B, <2 x double> %C, double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   ret double* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x2.v2f64.p0f64(<2 x double>, <2 x double>, double*)
+declare void @llvm.aarch64.neon.st1x2.v2f64.p0f64(<2 x double>, <2 x double>, double*)
 
 
 define double* @test_v1f64_post_imm_st1x2(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C) nounwind {
 ;CHECK-LABEL: test_v1f64_post_imm_st1x2:
 ;CHECK: st1.1d { v0, v1 }, [x0], #16
-  call void @llvm.arm64.neon.st1x2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A)
+  call void @llvm.aarch64.neon.st1x2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A)
   %tmp = getelementptr double* %A, i64 2
   ret double* %tmp
 }
@@ -4527,18 +4527,18 @@
 define double* @test_v1f64_post_reg_st1x2(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v1f64_post_reg_st1x2:
 ;CHECK: st1.1d { v0, v1 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A)
+  call void @llvm.aarch64.neon.st1x2.v1f64.p0f64(<1 x double> %B, <1 x double> %C, double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   ret double* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x2.v1f64.p0f64(<1 x double>, <1 x double>, double*)
+declare void @llvm.aarch64.neon.st1x2.v1f64.p0f64(<1 x double>, <1 x double>, double*)
 
 
 define i8* @test_v16i8_post_imm_st1x3(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind {
 ;CHECK-LABEL: test_v16i8_post_imm_st1x3:
 ;CHECK: st1.16b { v0, v1, v2 }, [x0], #48
-  call void @llvm.arm64.neon.st1x3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A)
+  call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A)
   %tmp = getelementptr i8* %A, i32 48
   ret i8* %tmp
 }
@@ -4546,18 +4546,18 @@
 define i8* @test_v16i8_post_reg_st1x3(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v16i8_post_reg_st1x3:
 ;CHECK: st1.16b { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A)
+  call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   ret i8* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*)
+declare void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*)
 
 
 define i8* @test_v8i8_post_imm_st1x3(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind {
 ;CHECK-LABEL: test_v8i8_post_imm_st1x3:
 ;CHECK: st1.8b { v0, v1, v2 }, [x0], #24
-  call void @llvm.arm64.neon.st1x3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A)
+  call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A)
   %tmp = getelementptr i8* %A, i32 24
   ret i8* %tmp
 }
@@ -4565,18 +4565,18 @@
 define i8* @test_v8i8_post_reg_st1x3(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v8i8_post_reg_st1x3:
 ;CHECK: st1.8b { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A)
+  call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   ret i8* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*)
+declare void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*)
 
 
 define i16* @test_v8i16_post_imm_st1x3(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind {
 ;CHECK-LABEL: test_v8i16_post_imm_st1x3:
 ;CHECK: st1.8h { v0, v1, v2 }, [x0], #48
-  call void @llvm.arm64.neon.st1x3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A)
+  call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A)
   %tmp = getelementptr i16* %A, i32 24
   ret i16* %tmp
 }
@@ -4584,18 +4584,18 @@
 define i16* @test_v8i16_post_reg_st1x3(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v8i16_post_reg_st1x3:
 ;CHECK: st1.8h { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A)
+  call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   ret i16* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*)
+declare void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*)
 
 
 define i16* @test_v4i16_post_imm_st1x3(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind {
 ;CHECK-LABEL: test_v4i16_post_imm_st1x3:
 ;CHECK: st1.4h { v0, v1, v2 }, [x0], #24
-  call void @llvm.arm64.neon.st1x3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A)
+  call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A)
   %tmp = getelementptr i16* %A, i32 12
   ret i16* %tmp
 }
@@ -4603,18 +4603,18 @@
 define i16* @test_v4i16_post_reg_st1x3(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4i16_post_reg_st1x3:
 ;CHECK: st1.4h { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A)
+  call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   ret i16* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*)
+declare void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*)
 
 
 define i32* @test_v4i32_post_imm_st1x3(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) nounwind {
 ;CHECK-LABEL: test_v4i32_post_imm_st1x3:
 ;CHECK: st1.4s { v0, v1, v2 }, [x0], #48
-  call void @llvm.arm64.neon.st1x3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A)
+  call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A)
   %tmp = getelementptr i32* %A, i32 12
   ret i32* %tmp
 }
@@ -4622,18 +4622,18 @@
 define i32* @test_v4i32_post_reg_st1x3(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4i32_post_reg_st1x3:
 ;CHECK: st1.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A)
+  call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   ret i32* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*)
+declare void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*)
 
 
 define i32* @test_v2i32_post_imm_st1x3(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind {
 ;CHECK-LABEL: test_v2i32_post_imm_st1x3:
 ;CHECK: st1.2s { v0, v1, v2 }, [x0], #24
-  call void @llvm.arm64.neon.st1x3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A)
+  call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A)
   %tmp = getelementptr i32* %A, i32 6
   ret i32* %tmp
 }
@@ -4641,18 +4641,18 @@
 define i32* @test_v2i32_post_reg_st1x3(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2i32_post_reg_st1x3:
 ;CHECK: st1.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A)
+  call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   ret i32* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*)
+declare void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*)
 
 
 define i64* @test_v2i64_post_imm_st1x3(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind {
 ;CHECK-LABEL: test_v2i64_post_imm_st1x3:
 ;CHECK: st1.2d { v0, v1, v2 }, [x0], #48
-  call void @llvm.arm64.neon.st1x3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A)
+  call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A)
   %tmp = getelementptr i64* %A, i64 6
   ret i64* %tmp
 }
@@ -4660,18 +4660,18 @@
 define i64* @test_v2i64_post_reg_st1x3(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2i64_post_reg_st1x3:
 ;CHECK: st1.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A)
+  call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   ret i64* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*)
+declare void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*)
 
 
 define i64* @test_v1i64_post_imm_st1x3(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind {
 ;CHECK-LABEL: test_v1i64_post_imm_st1x3:
 ;CHECK: st1.1d { v0, v1, v2 }, [x0], #24
-  call void @llvm.arm64.neon.st1x3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A)
+  call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A)
   %tmp = getelementptr i64* %A, i64 3
   ret i64* %tmp
 }
@@ -4679,18 +4679,18 @@
 define i64* @test_v1i64_post_reg_st1x3(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v1i64_post_reg_st1x3:
 ;CHECK: st1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A)
+  call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   ret i64* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*)
+declare void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*)
 
 
 define float* @test_v4f32_post_imm_st1x3(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind {
 ;CHECK-LABEL: test_v4f32_post_imm_st1x3:
 ;CHECK: st1.4s { v0, v1, v2 }, [x0], #48
-  call void @llvm.arm64.neon.st1x3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A)
+  call void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A)
   %tmp = getelementptr float* %A, i32 12
   ret float* %tmp
 }
@@ -4698,18 +4698,18 @@
 define float* @test_v4f32_post_reg_st1x3(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4f32_post_reg_st1x3:
 ;CHECK: st1.4s { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A)
+  call void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   ret float* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*)
+declare void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*)
 
 
 define float* @test_v2f32_post_imm_st1x3(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D) nounwind {
 ;CHECK-LABEL: test_v2f32_post_imm_st1x3:
 ;CHECK: st1.2s { v0, v1, v2 }, [x0], #24
-  call void @llvm.arm64.neon.st1x3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A)
+  call void @llvm.aarch64.neon.st1x3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A)
   %tmp = getelementptr float* %A, i32 6
   ret float* %tmp
 }
@@ -4717,18 +4717,18 @@
 define float* @test_v2f32_post_reg_st1x3(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2f32_post_reg_st1x3:
 ;CHECK: st1.2s { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A)
+  call void @llvm.aarch64.neon.st1x3.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   ret float* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x3.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, float*)
+declare void @llvm.aarch64.neon.st1x3.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, float*)
 
 
 define double* @test_v2f64_post_imm_st1x3(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind {
 ;CHECK-LABEL: test_v2f64_post_imm_st1x3:
 ;CHECK: st1.2d { v0, v1, v2 }, [x0], #48
-  call void @llvm.arm64.neon.st1x3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A)
+  call void @llvm.aarch64.neon.st1x3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A)
   %tmp = getelementptr double* %A, i64 6
   ret double* %tmp
 }
@@ -4736,18 +4736,18 @@
 define double* @test_v2f64_post_reg_st1x3(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2f64_post_reg_st1x3:
 ;CHECK: st1.2d { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A)
+  call void @llvm.aarch64.neon.st1x3.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   ret double* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x3.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, double*)
+declare void @llvm.aarch64.neon.st1x3.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, double*)
 
 
 define double* @test_v1f64_post_imm_st1x3(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind {
 ;CHECK-LABEL: test_v1f64_post_imm_st1x3:
 ;CHECK: st1.1d { v0, v1, v2 }, [x0], #24
-  call void @llvm.arm64.neon.st1x3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A)
+  call void @llvm.aarch64.neon.st1x3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A)
   %tmp = getelementptr double* %A, i64 3
   ret double* %tmp
 }
@@ -4755,18 +4755,18 @@
 define double* @test_v1f64_post_reg_st1x3(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v1f64_post_reg_st1x3:
 ;CHECK: st1.1d { v0, v1, v2 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A)
+  call void @llvm.aarch64.neon.st1x3.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   ret double* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x3.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, double*)
+declare void @llvm.aarch64.neon.st1x3.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, double*)
 
 
 define i8* @test_v16i8_post_imm_st1x4(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind {
 ;CHECK-LABEL: test_v16i8_post_imm_st1x4:
 ;CHECK: st1.16b { v0, v1, v2, v3 }, [x0], #64
-  call void @llvm.arm64.neon.st1x4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A)
+  call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A)
   %tmp = getelementptr i8* %A, i32 64
   ret i8* %tmp
 }
@@ -4774,18 +4774,18 @@
 define i8* @test_v16i8_post_reg_st1x4(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v16i8_post_reg_st1x4:
 ;CHECK: st1.16b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A)
+  call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   ret i8* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*)
+declare void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*)
 
 
 define i8* @test_v8i8_post_imm_st1x4(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind {
 ;CHECK-LABEL: test_v8i8_post_imm_st1x4:
 ;CHECK: st1.8b { v0, v1, v2, v3 }, [x0], #32
-  call void @llvm.arm64.neon.st1x4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A)
+  call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A)
   %tmp = getelementptr i8* %A, i32 32
   ret i8* %tmp
 }
@@ -4793,18 +4793,18 @@
 define i8* @test_v8i8_post_reg_st1x4(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v8i8_post_reg_st1x4:
 ;CHECK: st1.8b { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A)
+  call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   ret i8* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*)
+declare void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*)
 
 
 define i16* @test_v8i16_post_imm_st1x4(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind {
 ;CHECK-LABEL: test_v8i16_post_imm_st1x4:
 ;CHECK: st1.8h { v0, v1, v2, v3 }, [x0], #64
-  call void @llvm.arm64.neon.st1x4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A)
+  call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A)
   %tmp = getelementptr i16* %A, i32 32
   ret i16* %tmp
 }
@@ -4812,18 +4812,18 @@
 define i16* @test_v8i16_post_reg_st1x4(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v8i16_post_reg_st1x4:
 ;CHECK: st1.8h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A)
+  call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   ret i16* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*)
+declare void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*)
 
 
 define i16* @test_v4i16_post_imm_st1x4(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind {
 ;CHECK-LABEL: test_v4i16_post_imm_st1x4:
 ;CHECK: st1.4h { v0, v1, v2, v3 }, [x0], #32
-  call void @llvm.arm64.neon.st1x4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A)
+  call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A)
   %tmp = getelementptr i16* %A, i32 16
   ret i16* %tmp
 }
@@ -4831,18 +4831,18 @@
 define i16* @test_v4i16_post_reg_st1x4(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4i16_post_reg_st1x4:
 ;CHECK: st1.4h { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A)
+  call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   ret i16* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>,<4 x i16>,  i16*)
+declare void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i16*)
 
 
 define i32* @test_v4i32_post_imm_st1x4(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind {
 ;CHECK-LABEL: test_v4i32_post_imm_st1x4:
 ;CHECK: st1.4s { v0, v1, v2, v3 }, [x0], #64
-  call void @llvm.arm64.neon.st1x4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A)
+  call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A)
   %tmp = getelementptr i32* %A, i32 16
   ret i32* %tmp
 }
@@ -4850,18 +4850,18 @@
 define i32* @test_v4i32_post_reg_st1x4(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4i32_post_reg_st1x4:
 ;CHECK: st1.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A)
+  call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   ret i32* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>,<4 x i32>,  i32*)
+declare void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32*)
 
 
 define i32* @test_v2i32_post_imm_st1x4(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind {
 ;CHECK-LABEL: test_v2i32_post_imm_st1x4:
 ;CHECK: st1.2s { v0, v1, v2, v3 }, [x0], #32
-  call void @llvm.arm64.neon.st1x4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A)
+  call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A)
   %tmp = getelementptr i32* %A, i32 8
   ret i32* %tmp
 }
@@ -4869,18 +4869,18 @@
 define i32* @test_v2i32_post_reg_st1x4(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2i32_post_reg_st1x4:
 ;CHECK: st1.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A)
+  call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   ret i32* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*)
+declare void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*)
 
 
 define i64* @test_v2i64_post_imm_st1x4(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E) nounwind {
 ;CHECK-LABEL: test_v2i64_post_imm_st1x4:
 ;CHECK: st1.2d { v0, v1, v2, v3 }, [x0], #64
-  call void @llvm.arm64.neon.st1x4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A)
+  call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A)
   %tmp = getelementptr i64* %A, i64 8
   ret i64* %tmp
 }
@@ -4888,18 +4888,18 @@
 define i64* @test_v2i64_post_reg_st1x4(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2i64_post_reg_st1x4:
 ;CHECK: st1.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A)
+  call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   ret i64* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>,<2 x i64>,  i64*)
+declare void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64*)
 
 
 define i64* @test_v1i64_post_imm_st1x4(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind {
 ;CHECK-LABEL: test_v1i64_post_imm_st1x4:
 ;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], #32
-  call void @llvm.arm64.neon.st1x4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A)
+  call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A)
   %tmp = getelementptr i64* %A, i64 4
   ret i64* %tmp
 }
@@ -4907,18 +4907,18 @@
 define i64* @test_v1i64_post_reg_st1x4(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v1i64_post_reg_st1x4:
 ;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A)
+  call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   ret i64* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>,<1 x i64>,  i64*)
+declare void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64*)
 
 
 define float* @test_v4f32_post_imm_st1x4(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E) nounwind {
 ;CHECK-LABEL: test_v4f32_post_imm_st1x4:
 ;CHECK: st1.4s { v0, v1, v2, v3 }, [x0], #64
-  call void @llvm.arm64.neon.st1x4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A)
+  call void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A)
   %tmp = getelementptr float* %A, i32 16
   ret float* %tmp
 }
@@ -4926,18 +4926,18 @@
 define float* @test_v4f32_post_reg_st1x4(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4f32_post_reg_st1x4:
 ;CHECK: st1.4s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A)
+  call void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   ret float* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*)
+declare void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*)
 
 
 define float* @test_v2f32_post_imm_st1x4(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind {
 ;CHECK-LABEL: test_v2f32_post_imm_st1x4:
 ;CHECK: st1.2s { v0, v1, v2, v3 }, [x0], #32
-  call void @llvm.arm64.neon.st1x4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A)
+  call void @llvm.aarch64.neon.st1x4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A)
   %tmp = getelementptr float* %A, i32 8
   ret float* %tmp
 }
@@ -4945,18 +4945,18 @@
 define float* @test_v2f32_post_reg_st1x4(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2f32_post_reg_st1x4:
 ;CHECK: st1.2s { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A)
+  call void @llvm.aarch64.neon.st1x4.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   ret float* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x4.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, float*)
+declare void @llvm.aarch64.neon.st1x4.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, float*)
 
 
 define double* @test_v2f64_post_imm_st1x4(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E) nounwind {
 ;CHECK-LABEL: test_v2f64_post_imm_st1x4:
 ;CHECK: st1.2d { v0, v1, v2, v3 }, [x0], #64
-  call void @llvm.arm64.neon.st1x4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A)
+  call void @llvm.aarch64.neon.st1x4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A)
   %tmp = getelementptr double* %A, i64 8
   ret double* %tmp
 }
@@ -4964,18 +4964,18 @@
 define double* @test_v2f64_post_reg_st1x4(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2f64_post_reg_st1x4:
 ;CHECK: st1.2d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A)
+  call void @llvm.aarch64.neon.st1x4.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   ret double* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x4.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>,<2 x double>,  double*)
+declare void @llvm.aarch64.neon.st1x4.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, double*)
 
 
 define double* @test_v1f64_post_imm_st1x4(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind {
 ;CHECK-LABEL: test_v1f64_post_imm_st1x4:
 ;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], #32
-  call void @llvm.arm64.neon.st1x4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A)
+  call void @llvm.aarch64.neon.st1x4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A)
   %tmp = getelementptr double* %A, i64 4
   ret double* %tmp
 }
@@ -4983,33 +4983,33 @@
 define double* @test_v1f64_post_reg_st1x4(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v1f64_post_reg_st1x4:
 ;CHECK: st1.1d { v0, v1, v2, v3 }, [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st1x4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A)
+  call void @llvm.aarch64.neon.st1x4.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   ret double* %tmp
 }
 
-declare void @llvm.arm64.neon.st1x4.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, double*)
+declare void @llvm.aarch64.neon.st1x4.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, double*)
 
 
 define i8* @test_v16i8_post_imm_st2lanelane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) {
-  call void @llvm.arm64.neon.st2lanelane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i64 1, i8* %A)
+  call void @llvm.aarch64.neon.st2lanelane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i64 1, i8* %A)
   %tmp = getelementptr i8* %A, i32 2
   ret i8* %tmp
 }
 
 define i8* @test_v16i8_post_reg_st2lanelane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, i64 %inc) {
-  call void @llvm.arm64.neon.st2lanelane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i64 1, i8* %A)
+  call void @llvm.aarch64.neon.st2lanelane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i64 1, i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   ret i8* %tmp
 }
 
-declare void @llvm.arm64.neon.st2lanelane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i64, i8*) nounwind readnone
+declare void @llvm.aarch64.neon.st2lanelane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i64, i8*) nounwind readnone
 
 
 define i8* @test_v16i8_post_imm_st2lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C) nounwind {
 ;CHECK-LABEL: test_v16i8_post_imm_st2lane:
 ;CHECK: st2.b { v0, v1 }[0], [x0], #2
-  call void @llvm.arm64.neon.st2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A)
+  call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A)
   %tmp = getelementptr i8* %A, i32 2
   ret i8* %tmp
 }
@@ -5017,18 +5017,18 @@
 define i8* @test_v16i8_post_reg_st2lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v16i8_post_reg_st2lane:
 ;CHECK: st2.b { v0, v1 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A)
+  call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, i64 0, i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   ret i8* %tmp
 }
 
-declare void @llvm.arm64.neon.st2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*)
+declare void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*)
 
 
 define i8* @test_v8i8_post_imm_st2lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C) nounwind {
 ;CHECK-LABEL: test_v8i8_post_imm_st2lane:
 ;CHECK: st2.b { v0, v1 }[0], [x0], #2
-  call void @llvm.arm64.neon.st2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A)
+  call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A)
   %tmp = getelementptr i8* %A, i32 2
   ret i8* %tmp
 }
@@ -5036,18 +5036,18 @@
 define i8* @test_v8i8_post_reg_st2lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v8i8_post_reg_st2lane:
 ;CHECK: st2.b { v0, v1 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A)
+  call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, i64 0, i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   ret i8* %tmp
 }
 
-declare void @llvm.arm64.neon.st2lane.v8i8.p0i8(<8 x i8>, <8 x i8>, i64, i8*)
+declare void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8>, <8 x i8>, i64, i8*)
 
 
 define i16* @test_v8i16_post_imm_st2lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C) nounwind {
 ;CHECK-LABEL: test_v8i16_post_imm_st2lane:
 ;CHECK: st2.h { v0, v1 }[0], [x0], #4
-  call void @llvm.arm64.neon.st2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A)
+  call void @llvm.aarch64.neon.st2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A)
   %tmp = getelementptr i16* %A, i32 2
   ret i16* %tmp
 }
@@ -5055,18 +5055,18 @@
 define i16* @test_v8i16_post_reg_st2lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v8i16_post_reg_st2lane:
 ;CHECK: st2.h { v0, v1 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A)
+  call void @llvm.aarch64.neon.st2lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, i64 0, i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   ret i16* %tmp
 }
 
-declare void @llvm.arm64.neon.st2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*)
+declare void @llvm.aarch64.neon.st2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*)
 
 
 define i16* @test_v4i16_post_imm_st2lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C) nounwind {
 ;CHECK-LABEL: test_v4i16_post_imm_st2lane:
 ;CHECK: st2.h { v0, v1 }[0], [x0], #4
-  call void @llvm.arm64.neon.st2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A)
+  call void @llvm.aarch64.neon.st2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A)
   %tmp = getelementptr i16* %A, i32 2
   ret i16* %tmp
 }
@@ -5074,18 +5074,18 @@
 define i16* @test_v4i16_post_reg_st2lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4i16_post_reg_st2lane:
 ;CHECK: st2.h { v0, v1 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A)
+  call void @llvm.aarch64.neon.st2lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, i64 0, i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   ret i16* %tmp
 }
 
-declare void @llvm.arm64.neon.st2lane.v4i16.p0i16(<4 x i16>, <4 x i16>, i64, i16*)
+declare void @llvm.aarch64.neon.st2lane.v4i16.p0i16(<4 x i16>, <4 x i16>, i64, i16*)
 
 
 define i32* @test_v4i32_post_imm_st2lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C) nounwind {
 ;CHECK-LABEL: test_v4i32_post_imm_st2lane:
 ;CHECK: st2.s { v0, v1 }[0], [x0], #8
-  call void @llvm.arm64.neon.st2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A)
+  call void @llvm.aarch64.neon.st2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A)
   %tmp = getelementptr i32* %A, i32 2
   ret i32* %tmp
 }
@@ -5093,18 +5093,18 @@
 define i32* @test_v4i32_post_reg_st2lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4i32_post_reg_st2lane:
 ;CHECK: st2.s { v0, v1 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A)
+  call void @llvm.aarch64.neon.st2lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, i64 0, i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   ret i32* %tmp
 }
 
-declare void @llvm.arm64.neon.st2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*)
+declare void @llvm.aarch64.neon.st2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*)
 
 
 define i32* @test_v2i32_post_imm_st2lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C) nounwind {
 ;CHECK-LABEL: test_v2i32_post_imm_st2lane:
 ;CHECK: st2.s { v0, v1 }[0], [x0], #8
-  call void @llvm.arm64.neon.st2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A)
+  call void @llvm.aarch64.neon.st2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A)
   %tmp = getelementptr i32* %A, i32 2
   ret i32* %tmp
 }
@@ -5112,18 +5112,18 @@
 define i32* @test_v2i32_post_reg_st2lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2i32_post_reg_st2lane:
 ;CHECK: st2.s { v0, v1 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A)
+  call void @llvm.aarch64.neon.st2lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, i64 0, i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   ret i32* %tmp
 }
 
-declare void @llvm.arm64.neon.st2lane.v2i32.p0i32(<2 x i32>, <2 x i32>, i64, i32*)
+declare void @llvm.aarch64.neon.st2lane.v2i32.p0i32(<2 x i32>, <2 x i32>, i64, i32*)
 
 
 define i64* @test_v2i64_post_imm_st2lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C) nounwind {
 ;CHECK-LABEL: test_v2i64_post_imm_st2lane:
 ;CHECK: st2.d { v0, v1 }[0], [x0], #16
-  call void @llvm.arm64.neon.st2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A)
+  call void @llvm.aarch64.neon.st2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A)
   %tmp = getelementptr i64* %A, i64 2
   ret i64* %tmp
 }
@@ -5131,18 +5131,18 @@
 define i64* @test_v2i64_post_reg_st2lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2i64_post_reg_st2lane:
 ;CHECK: st2.d { v0, v1 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A)
+  call void @llvm.aarch64.neon.st2lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, i64 0, i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   ret i64* %tmp
 }
 
-declare void @llvm.arm64.neon.st2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*)
+declare void @llvm.aarch64.neon.st2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*)
 
 
 define i64* @test_v1i64_post_imm_st2lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C) nounwind {
 ;CHECK-LABEL: test_v1i64_post_imm_st2lane:
 ;CHECK: st2.d { v0, v1 }[0], [x0], #16
-  call void @llvm.arm64.neon.st2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A)
+  call void @llvm.aarch64.neon.st2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A)
   %tmp = getelementptr i64* %A, i64 2
   ret i64* %tmp
 }
@@ -5150,18 +5150,18 @@
 define i64* @test_v1i64_post_reg_st2lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v1i64_post_reg_st2lane:
 ;CHECK: st2.d { v0, v1 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A)
+  call void @llvm.aarch64.neon.st2lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, i64 0, i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   ret i64* %tmp
 }
 
-declare void @llvm.arm64.neon.st2lane.v1i64.p0i64(<1 x i64>, <1 x i64>, i64, i64*)
+declare void @llvm.aarch64.neon.st2lane.v1i64.p0i64(<1 x i64>, <1 x i64>, i64, i64*)
 
 
 define float* @test_v4f32_post_imm_st2lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C) nounwind {
 ;CHECK-LABEL: test_v4f32_post_imm_st2lane:
 ;CHECK: st2.s { v0, v1 }[0], [x0], #8
-  call void @llvm.arm64.neon.st2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A)
+  call void @llvm.aarch64.neon.st2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A)
   %tmp = getelementptr float* %A, i32 2
   ret float* %tmp
 }
@@ -5169,18 +5169,18 @@
 define float* @test_v4f32_post_reg_st2lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4f32_post_reg_st2lane:
 ;CHECK: st2.s { v0, v1 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A)
+  call void @llvm.aarch64.neon.st2lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, i64 0, float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   ret float* %tmp
 }
 
-declare void @llvm.arm64.neon.st2lane.v4f32.p0f32(<4 x float>, <4 x float>, i64, float*)
+declare void @llvm.aarch64.neon.st2lane.v4f32.p0f32(<4 x float>, <4 x float>, i64, float*)
 
 
 define float* @test_v2f32_post_imm_st2lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C) nounwind {
 ;CHECK-LABEL: test_v2f32_post_imm_st2lane:
 ;CHECK: st2.s { v0, v1 }[0], [x0], #8
-  call void @llvm.arm64.neon.st2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A)
+  call void @llvm.aarch64.neon.st2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A)
   %tmp = getelementptr float* %A, i32 2
   ret float* %tmp
 }
@@ -5188,18 +5188,18 @@
 define float* @test_v2f32_post_reg_st2lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2f32_post_reg_st2lane:
 ;CHECK: st2.s { v0, v1 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A)
+  call void @llvm.aarch64.neon.st2lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, i64 0, float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   ret float* %tmp
 }
 
-declare void @llvm.arm64.neon.st2lane.v2f32.p0f32(<2 x float>, <2 x float>, i64, float*)
+declare void @llvm.aarch64.neon.st2lane.v2f32.p0f32(<2 x float>, <2 x float>, i64, float*)
 
 
 define double* @test_v2f64_post_imm_st2lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C) nounwind {
 ;CHECK-LABEL: test_v2f64_post_imm_st2lane:
 ;CHECK: st2.d { v0, v1 }[0], [x0], #16
-  call void @llvm.arm64.neon.st2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A)
+  call void @llvm.aarch64.neon.st2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A)
   %tmp = getelementptr double* %A, i64 2
   ret double* %tmp
 }
@@ -5207,18 +5207,18 @@
 define double* @test_v2f64_post_reg_st2lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2f64_post_reg_st2lane:
 ;CHECK: st2.d { v0, v1 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A)
+  call void @llvm.aarch64.neon.st2lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, i64 0, double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   ret double* %tmp
 }
 
-declare void @llvm.arm64.neon.st2lane.v2f64.p0f64(<2 x double>, <2 x double>, i64, double*)
+declare void @llvm.aarch64.neon.st2lane.v2f64.p0f64(<2 x double>, <2 x double>, i64, double*)
 
 
 define double* @test_v1f64_post_imm_st2lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C) nounwind {
 ;CHECK-LABEL: test_v1f64_post_imm_st2lane:
 ;CHECK: st2.d { v0, v1 }[0], [x0], #16
-  call void @llvm.arm64.neon.st2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A)
+  call void @llvm.aarch64.neon.st2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A)
   %tmp = getelementptr double* %A, i64 2
   ret double* %tmp
 }
@@ -5226,18 +5226,18 @@
 define double* @test_v1f64_post_reg_st2lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v1f64_post_reg_st2lane:
 ;CHECK: st2.d { v0, v1 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A)
+  call void @llvm.aarch64.neon.st2lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, i64 0, double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   ret double* %tmp
 }
 
-declare void @llvm.arm64.neon.st2lane.v1f64.p0f64(<1 x double>, <1 x double>, i64, double*)
+declare void @llvm.aarch64.neon.st2lane.v1f64.p0f64(<1 x double>, <1 x double>, i64, double*)
 
 
 define i8* @test_v16i8_post_imm_st3lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) nounwind {
 ;CHECK-LABEL: test_v16i8_post_imm_st3lane:
 ;CHECK: st3.b { v0, v1, v2 }[0], [x0], #3
-  call void @llvm.arm64.neon.st3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A)
+  call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A)
   %tmp = getelementptr i8* %A, i32 3
   ret i8* %tmp
 }
@@ -5245,18 +5245,18 @@
 define i8* @test_v16i8_post_reg_st3lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v16i8_post_reg_st3lane:
 ;CHECK: st3.b { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A)
+  call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 0, i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   ret i8* %tmp
 }
 
-declare void @llvm.arm64.neon.st3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*)
+declare void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*)
 
 
 define i8* @test_v8i8_post_imm_st3lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D) nounwind {
 ;CHECK-LABEL: test_v8i8_post_imm_st3lane:
 ;CHECK: st3.b { v0, v1, v2 }[0], [x0], #3
-  call void @llvm.arm64.neon.st3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A)
+  call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A)
   %tmp = getelementptr i8* %A, i32 3
   ret i8* %tmp
 }
@@ -5264,18 +5264,18 @@
 define i8* @test_v8i8_post_reg_st3lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v8i8_post_reg_st3lane:
 ;CHECK: st3.b { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A)
+  call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i64 0, i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   ret i8* %tmp
 }
 
-declare void @llvm.arm64.neon.st3lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i64, i8*)
+declare void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i64, i8*)
 
 
 define i16* @test_v8i16_post_imm_st3lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D) nounwind {
 ;CHECK-LABEL: test_v8i16_post_imm_st3lane:
 ;CHECK: st3.h { v0, v1, v2 }[0], [x0], #6
-  call void @llvm.arm64.neon.st3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A)
+  call void @llvm.aarch64.neon.st3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A)
   %tmp = getelementptr i16* %A, i32 3
   ret i16* %tmp
 }
@@ -5283,18 +5283,18 @@
 define i16* @test_v8i16_post_reg_st3lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v8i16_post_reg_st3lane:
 ;CHECK: st3.h { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A)
+  call void @llvm.aarch64.neon.st3lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 0, i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   ret i16* %tmp
 }
 
-declare void @llvm.arm64.neon.st3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*)
+declare void @llvm.aarch64.neon.st3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*)
 
 
 define i16* @test_v4i16_post_imm_st3lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D) nounwind {
 ;CHECK-LABEL: test_v4i16_post_imm_st3lane:
 ;CHECK: st3.h { v0, v1, v2 }[0], [x0], #6
-  call void @llvm.arm64.neon.st3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A)
+  call void @llvm.aarch64.neon.st3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A)
   %tmp = getelementptr i16* %A, i32 3
   ret i16* %tmp
 }
@@ -5302,18 +5302,18 @@
 define i16* @test_v4i16_post_reg_st3lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4i16_post_reg_st3lane:
 ;CHECK: st3.h { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A)
+  call void @llvm.aarch64.neon.st3lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i64 0, i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   ret i16* %tmp
 }
 
-declare void @llvm.arm64.neon.st3lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i64, i16*)
+declare void @llvm.aarch64.neon.st3lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i64, i16*)
 
 
 define i32* @test_v4i32_post_imm_st3lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D) nounwind {
 ;CHECK-LABEL: test_v4i32_post_imm_st3lane:
 ;CHECK: st3.s { v0, v1, v2 }[0], [x0], #12
-  call void @llvm.arm64.neon.st3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A)
+  call void @llvm.aarch64.neon.st3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A)
   %tmp = getelementptr i32* %A, i32 3
   ret i32* %tmp
 }
@@ -5321,18 +5321,18 @@
 define i32* @test_v4i32_post_reg_st3lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4i32_post_reg_st3lane:
 ;CHECK: st3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A)
+  call void @llvm.aarch64.neon.st3lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 0, i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   ret i32* %tmp
 }
 
-declare void @llvm.arm64.neon.st3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*)
+declare void @llvm.aarch64.neon.st3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*)
 
 
 define i32* @test_v2i32_post_imm_st3lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D) nounwind {
 ;CHECK-LABEL: test_v2i32_post_imm_st3lane:
 ;CHECK: st3.s { v0, v1, v2 }[0], [x0], #12
-  call void @llvm.arm64.neon.st3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A)
+  call void @llvm.aarch64.neon.st3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A)
   %tmp = getelementptr i32* %A, i32 3
   ret i32* %tmp
 }
@@ -5340,18 +5340,18 @@
 define i32* @test_v2i32_post_reg_st3lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2i32_post_reg_st3lane:
 ;CHECK: st3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A)
+  call void @llvm.aarch64.neon.st3lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i64 0, i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   ret i32* %tmp
 }
 
-declare void @llvm.arm64.neon.st3lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i64, i32*)
+declare void @llvm.aarch64.neon.st3lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i64, i32*)
 
 
 define i64* @test_v2i64_post_imm_st3lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D) nounwind {
 ;CHECK-LABEL: test_v2i64_post_imm_st3lane:
 ;CHECK: st3.d { v0, v1, v2 }[0], [x0], #24
-  call void @llvm.arm64.neon.st3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A)
+  call void @llvm.aarch64.neon.st3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A)
   %tmp = getelementptr i64* %A, i64 3
   ret i64* %tmp
 }
@@ -5359,18 +5359,18 @@
 define i64* @test_v2i64_post_reg_st3lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2i64_post_reg_st3lane:
 ;CHECK: st3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A)
+  call void @llvm.aarch64.neon.st3lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 0, i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   ret i64* %tmp
 }
 
-declare void @llvm.arm64.neon.st3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*)
+declare void @llvm.aarch64.neon.st3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*)
 
 
 define i64* @test_v1i64_post_imm_st3lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D) nounwind {
 ;CHECK-LABEL: test_v1i64_post_imm_st3lane:
 ;CHECK: st3.d { v0, v1, v2 }[0], [x0], #24
-  call void @llvm.arm64.neon.st3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A)
+  call void @llvm.aarch64.neon.st3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A)
   %tmp = getelementptr i64* %A, i64 3
   ret i64* %tmp
 }
@@ -5378,18 +5378,18 @@
 define i64* @test_v1i64_post_reg_st3lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v1i64_post_reg_st3lane:
 ;CHECK: st3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A)
+  call void @llvm.aarch64.neon.st3lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64 0, i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   ret i64* %tmp
 }
 
-declare void @llvm.arm64.neon.st3lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64, i64*)
+declare void @llvm.aarch64.neon.st3lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64, i64*)
 
 
 define float* @test_v4f32_post_imm_st3lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D) nounwind {
 ;CHECK-LABEL: test_v4f32_post_imm_st3lane:
 ;CHECK: st3.s { v0, v1, v2 }[0], [x0], #12
-  call void @llvm.arm64.neon.st3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A)
+  call void @llvm.aarch64.neon.st3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A)
   %tmp = getelementptr float* %A, i32 3
   ret float* %tmp
 }
@@ -5397,18 +5397,18 @@
 define float* @test_v4f32_post_reg_st3lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4f32_post_reg_st3lane:
 ;CHECK: st3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A)
+  call void @llvm.aarch64.neon.st3lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, i64 0, float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   ret float* %tmp
 }
 
-declare void @llvm.arm64.neon.st3lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, i64, float*)
+declare void @llvm.aarch64.neon.st3lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, i64, float*)
 
 
 define float* @test_v2f32_post_imm_st3lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D) nounwind {
 ;CHECK-LABEL: test_v2f32_post_imm_st3lane:
 ;CHECK: st3.s { v0, v1, v2 }[0], [x0], #12
-  call void @llvm.arm64.neon.st3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A)
+  call void @llvm.aarch64.neon.st3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A)
   %tmp = getelementptr float* %A, i32 3
   ret float* %tmp
 }
@@ -5416,18 +5416,18 @@
 define float* @test_v2f32_post_reg_st3lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2f32_post_reg_st3lane:
 ;CHECK: st3.s { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A)
+  call void @llvm.aarch64.neon.st3lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, i64 0, float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   ret float* %tmp
 }
 
-declare void @llvm.arm64.neon.st3lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, i64, float*)
+declare void @llvm.aarch64.neon.st3lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, i64, float*)
 
 
 define double* @test_v2f64_post_imm_st3lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D) nounwind {
 ;CHECK-LABEL: test_v2f64_post_imm_st3lane:
 ;CHECK: st3.d { v0, v1, v2 }[0], [x0], #24
-  call void @llvm.arm64.neon.st3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A)
+  call void @llvm.aarch64.neon.st3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A)
   %tmp = getelementptr double* %A, i64 3
   ret double* %tmp
 }
@@ -5435,18 +5435,18 @@
 define double* @test_v2f64_post_reg_st3lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2f64_post_reg_st3lane:
 ;CHECK: st3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A)
+  call void @llvm.aarch64.neon.st3lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, i64 0, double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   ret double* %tmp
 }
 
-declare void @llvm.arm64.neon.st3lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, i64, double*)
+declare void @llvm.aarch64.neon.st3lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, i64, double*)
 
 
 define double* @test_v1f64_post_imm_st3lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D) nounwind {
 ;CHECK-LABEL: test_v1f64_post_imm_st3lane:
 ;CHECK: st3.d { v0, v1, v2 }[0], [x0], #24
-  call void @llvm.arm64.neon.st3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A)
+  call void @llvm.aarch64.neon.st3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A)
   %tmp = getelementptr double* %A, i64 3
   ret double* %tmp
 }
@@ -5454,18 +5454,18 @@
 define double* @test_v1f64_post_reg_st3lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v1f64_post_reg_st3lane:
 ;CHECK: st3.d { v0, v1, v2 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A)
+  call void @llvm.aarch64.neon.st3lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, i64 0, double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   ret double* %tmp
 }
 
-declare void @llvm.arm64.neon.st3lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, i64, double*)
+declare void @llvm.aarch64.neon.st3lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, i64, double*)
 
 
 define i8* @test_v16i8_post_imm_st4lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) nounwind {
 ;CHECK-LABEL: test_v16i8_post_imm_st4lane:
 ;CHECK: st4.b { v0, v1, v2, v3 }[0], [x0], #4
-  call void @llvm.arm64.neon.st4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A)
+  call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A)
   %tmp = getelementptr i8* %A, i32 4
   ret i8* %tmp
 }
@@ -5473,18 +5473,18 @@
 define i8* @test_v16i8_post_reg_st4lane(i8* %A, i8** %ptr, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v16i8_post_reg_st4lane:
 ;CHECK: st4.b { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A)
+  call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, i64 0, i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   ret i8* %tmp
 }
 
-declare void @llvm.arm64.neon.st4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*)
+declare void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*)
 
 
 define i8* @test_v8i8_post_imm_st4lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E) nounwind {
 ;CHECK-LABEL: test_v8i8_post_imm_st4lane:
 ;CHECK: st4.b { v0, v1, v2, v3 }[0], [x0], #4
-  call void @llvm.arm64.neon.st4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A)
+  call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A)
   %tmp = getelementptr i8* %A, i32 4
   ret i8* %tmp
 }
@@ -5492,18 +5492,18 @@
 define i8* @test_v8i8_post_reg_st4lane(i8* %A, i8** %ptr, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v8i8_post_reg_st4lane:
 ;CHECK: st4.b { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A)
+  call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> %B, <8 x i8> %C, <8 x i8> %D, <8 x i8> %E, i64 0, i8* %A)
   %tmp = getelementptr i8* %A, i64 %inc
   ret i8* %tmp
 }
 
-declare void @llvm.arm64.neon.st4lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i64, i8*)
+declare void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i64, i8*)
 
 
 define i16* @test_v8i16_post_imm_st4lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E) nounwind {
 ;CHECK-LABEL: test_v8i16_post_imm_st4lane:
 ;CHECK: st4.h { v0, v1, v2, v3 }[0], [x0], #8
-  call void @llvm.arm64.neon.st4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A)
+  call void @llvm.aarch64.neon.st4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A)
   %tmp = getelementptr i16* %A, i32 4
   ret i16* %tmp
 }
@@ -5511,18 +5511,18 @@
 define i16* @test_v8i16_post_reg_st4lane(i16* %A, i16** %ptr, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v8i16_post_reg_st4lane:
 ;CHECK: st4.h { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A)
+  call void @llvm.aarch64.neon.st4lane.v8i16.p0i16(<8 x i16> %B, <8 x i16> %C, <8 x i16> %D, <8 x i16> %E, i64 0, i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   ret i16* %tmp
 }
 
-declare void @llvm.arm64.neon.st4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*)
+declare void @llvm.aarch64.neon.st4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*)
 
 
 define i16* @test_v4i16_post_imm_st4lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E) nounwind {
 ;CHECK-LABEL: test_v4i16_post_imm_st4lane:
 ;CHECK: st4.h { v0, v1, v2, v3 }[0], [x0], #8
-  call void @llvm.arm64.neon.st4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A)
+  call void @llvm.aarch64.neon.st4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A)
   %tmp = getelementptr i16* %A, i32 4
   ret i16* %tmp
 }
@@ -5530,18 +5530,18 @@
 define i16* @test_v4i16_post_reg_st4lane(i16* %A, i16** %ptr, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4i16_post_reg_st4lane:
 ;CHECK: st4.h { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A)
+  call void @llvm.aarch64.neon.st4lane.v4i16.p0i16(<4 x i16> %B, <4 x i16> %C, <4 x i16> %D, <4 x i16> %E, i64 0, i16* %A)
   %tmp = getelementptr i16* %A, i64 %inc
   ret i16* %tmp
 }
 
-declare void @llvm.arm64.neon.st4lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i64, i16*)
+declare void @llvm.aarch64.neon.st4lane.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i64, i16*)
 
 
 define i32* @test_v4i32_post_imm_st4lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E) nounwind {
 ;CHECK-LABEL: test_v4i32_post_imm_st4lane:
 ;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], #16
-  call void @llvm.arm64.neon.st4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A)
+  call void @llvm.aarch64.neon.st4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A)
   %tmp = getelementptr i32* %A, i32 4
   ret i32* %tmp
 }
@@ -5549,18 +5549,18 @@
 define i32* @test_v4i32_post_reg_st4lane(i32* %A, i32** %ptr, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4i32_post_reg_st4lane:
 ;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A)
+  call void @llvm.aarch64.neon.st4lane.v4i32.p0i32(<4 x i32> %B, <4 x i32> %C, <4 x i32> %D, <4 x i32> %E, i64 0, i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   ret i32* %tmp
 }
 
-declare void @llvm.arm64.neon.st4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*)
+declare void @llvm.aarch64.neon.st4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*)
 
 
 define i32* @test_v2i32_post_imm_st4lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E) nounwind {
 ;CHECK-LABEL: test_v2i32_post_imm_st4lane:
 ;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], #16
-  call void @llvm.arm64.neon.st4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A)
+  call void @llvm.aarch64.neon.st4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A)
   %tmp = getelementptr i32* %A, i32 4
   ret i32* %tmp
 }
@@ -5568,18 +5568,18 @@
 define i32* @test_v2i32_post_reg_st4lane(i32* %A, i32** %ptr, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2i32_post_reg_st4lane:
 ;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A)
+  call void @llvm.aarch64.neon.st4lane.v2i32.p0i32(<2 x i32> %B, <2 x i32> %C, <2 x i32> %D, <2 x i32> %E, i64 0, i32* %A)
   %tmp = getelementptr i32* %A, i64 %inc
   ret i32* %tmp
 }
 
-declare void @llvm.arm64.neon.st4lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i64, i32*)
+declare void @llvm.aarch64.neon.st4lane.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i64, i32*)
 
 
 define i64* @test_v2i64_post_imm_st4lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E) nounwind {
 ;CHECK-LABEL: test_v2i64_post_imm_st4lane:
 ;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], #32
-  call void @llvm.arm64.neon.st4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A)
+  call void @llvm.aarch64.neon.st4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A)
   %tmp = getelementptr i64* %A, i64 4
   ret i64* %tmp
 }
@@ -5587,18 +5587,18 @@
 define i64* @test_v2i64_post_reg_st4lane(i64* %A, i64** %ptr, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2i64_post_reg_st4lane:
 ;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A)
+  call void @llvm.aarch64.neon.st4lane.v2i64.p0i64(<2 x i64> %B, <2 x i64> %C, <2 x i64> %D, <2 x i64> %E, i64 0, i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   ret i64* %tmp
 }
 
-declare void @llvm.arm64.neon.st4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*)
+declare void @llvm.aarch64.neon.st4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*)
 
 
 define i64* @test_v1i64_post_imm_st4lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E) nounwind {
 ;CHECK-LABEL: test_v1i64_post_imm_st4lane:
 ;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], #32
-  call void @llvm.arm64.neon.st4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A)
+  call void @llvm.aarch64.neon.st4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A)
   %tmp = getelementptr i64* %A, i64 4
   ret i64* %tmp
 }
@@ -5606,18 +5606,18 @@
 define i64* @test_v1i64_post_reg_st4lane(i64* %A, i64** %ptr, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v1i64_post_reg_st4lane:
 ;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A)
+  call void @llvm.aarch64.neon.st4lane.v1i64.p0i64(<1 x i64> %B, <1 x i64> %C, <1 x i64> %D, <1 x i64> %E, i64 0, i64* %A)
   %tmp = getelementptr i64* %A, i64 %inc
   ret i64* %tmp
 }
 
-declare void @llvm.arm64.neon.st4lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64, i64*)
+declare void @llvm.aarch64.neon.st4lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64, i64*)
 
 
 define float* @test_v4f32_post_imm_st4lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E) nounwind {
 ;CHECK-LABEL: test_v4f32_post_imm_st4lane:
 ;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], #16
-  call void @llvm.arm64.neon.st4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A)
+  call void @llvm.aarch64.neon.st4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A)
   %tmp = getelementptr float* %A, i32 4
   ret float* %tmp
 }
@@ -5625,18 +5625,18 @@
 define float* @test_v4f32_post_reg_st4lane(float* %A, float** %ptr, <4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v4f32_post_reg_st4lane:
 ;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A)
+  call void @llvm.aarch64.neon.st4lane.v4f32.p0f32(<4 x float> %B, <4 x float> %C, <4 x float> %D, <4 x float> %E, i64 0, float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   ret float* %tmp
 }
 
-declare void @llvm.arm64.neon.st4lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, i64, float*)
+declare void @llvm.aarch64.neon.st4lane.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, i64, float*)
 
 
 define float* @test_v2f32_post_imm_st4lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E) nounwind {
 ;CHECK-LABEL: test_v2f32_post_imm_st4lane:
 ;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], #16
-  call void @llvm.arm64.neon.st4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A)
+  call void @llvm.aarch64.neon.st4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A)
   %tmp = getelementptr float* %A, i32 4
   ret float* %tmp
 }
@@ -5644,18 +5644,18 @@
 define float* @test_v2f32_post_reg_st4lane(float* %A, float** %ptr, <2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2f32_post_reg_st4lane:
 ;CHECK: st4.s { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A)
+  call void @llvm.aarch64.neon.st4lane.v2f32.p0f32(<2 x float> %B, <2 x float> %C, <2 x float> %D, <2 x float> %E, i64 0, float* %A)
   %tmp = getelementptr float* %A, i64 %inc
   ret float* %tmp
 }
 
-declare void @llvm.arm64.neon.st4lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, i64, float*)
+declare void @llvm.aarch64.neon.st4lane.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, i64, float*)
 
 
 define double* @test_v2f64_post_imm_st4lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E) nounwind {
 ;CHECK-LABEL: test_v2f64_post_imm_st4lane:
 ;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], #32
-  call void @llvm.arm64.neon.st4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A)
+  call void @llvm.aarch64.neon.st4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A)
   %tmp = getelementptr double* %A, i64 4
   ret double* %tmp
 }
@@ -5663,18 +5663,18 @@
 define double* @test_v2f64_post_reg_st4lane(double* %A, double** %ptr, <2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v2f64_post_reg_st4lane:
 ;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A)
+  call void @llvm.aarch64.neon.st4lane.v2f64.p0f64(<2 x double> %B, <2 x double> %C, <2 x double> %D, <2 x double> %E, i64 0, double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   ret double* %tmp
 }
 
-declare void @llvm.arm64.neon.st4lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, i64, double*)
+declare void @llvm.aarch64.neon.st4lane.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, i64, double*)
 
 
 define double* @test_v1f64_post_imm_st4lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E) nounwind {
 ;CHECK-LABEL: test_v1f64_post_imm_st4lane:
 ;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], #32
-  call void @llvm.arm64.neon.st4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A)
+  call void @llvm.aarch64.neon.st4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A)
   %tmp = getelementptr double* %A, i64 4
   ret double* %tmp
 }
@@ -5682,12 +5682,12 @@
 define double* @test_v1f64_post_reg_st4lane(double* %A, double** %ptr, <1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 %inc) nounwind {
 ;CHECK-LABEL: test_v1f64_post_reg_st4lane:
 ;CHECK: st4.d { v0, v1, v2, v3 }[0], [x0], x{{[0-9]+}}
-  call void @llvm.arm64.neon.st4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A)
+  call void @llvm.aarch64.neon.st4lane.v1f64.p0f64(<1 x double> %B, <1 x double> %C, <1 x double> %D, <1 x double> %E, i64 0, double* %A)
   %tmp = getelementptr double* %A, i64 %inc
   ret double* %tmp
 }
 
-declare void @llvm.arm64.neon.st4lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, i64, double*)
+declare void @llvm.aarch64.neon.st4lane.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, i64, double*)
 
 define <16 x i8> @test_v16i8_post_imm_ld1r(i8* %bar, i8** %ptr) {
 ; CHECK-LABEL: test_v16i8_post_imm_ld1r:
diff --git a/llvm/test/CodeGen/ARM64/inline-asm-error-I.ll b/llvm/test/CodeGen/AArch64/arm64-inline-asm-error-I.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/inline-asm-error-I.ll
rename to llvm/test/CodeGen/AArch64/arm64-inline-asm-error-I.ll
diff --git a/llvm/test/CodeGen/ARM64/inline-asm-error-J.ll b/llvm/test/CodeGen/AArch64/arm64-inline-asm-error-J.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/inline-asm-error-J.ll
rename to llvm/test/CodeGen/AArch64/arm64-inline-asm-error-J.ll
diff --git a/llvm/test/CodeGen/ARM64/inline-asm-error-K.ll b/llvm/test/CodeGen/AArch64/arm64-inline-asm-error-K.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/inline-asm-error-K.ll
rename to llvm/test/CodeGen/AArch64/arm64-inline-asm-error-K.ll
diff --git a/llvm/test/CodeGen/ARM64/inline-asm-error-L.ll b/llvm/test/CodeGen/AArch64/arm64-inline-asm-error-L.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/inline-asm-error-L.ll
rename to llvm/test/CodeGen/AArch64/arm64-inline-asm-error-L.ll
diff --git a/llvm/test/CodeGen/ARM64/inline-asm-error-M.ll b/llvm/test/CodeGen/AArch64/arm64-inline-asm-error-M.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/inline-asm-error-M.ll
rename to llvm/test/CodeGen/AArch64/arm64-inline-asm-error-M.ll
diff --git a/llvm/test/CodeGen/ARM64/inline-asm-error-N.ll b/llvm/test/CodeGen/AArch64/arm64-inline-asm-error-N.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/inline-asm-error-N.ll
rename to llvm/test/CodeGen/AArch64/arm64-inline-asm-error-N.ll
diff --git a/llvm/test/CodeGen/ARM64/inline-asm-zero-reg-error.ll b/llvm/test/CodeGen/AArch64/arm64-inline-asm-zero-reg-error.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/inline-asm-zero-reg-error.ll
rename to llvm/test/CodeGen/AArch64/arm64-inline-asm-zero-reg-error.ll
diff --git a/llvm/test/CodeGen/ARM64/inline-asm.ll b/llvm/test/CodeGen/AArch64/arm64-inline-asm.ll
similarity index 98%
rename from llvm/test/CodeGen/ARM64/inline-asm.ll
rename to llvm/test/CodeGen/AArch64/arm64-inline-asm.ll
index e645078..d76cca3 100644
--- a/llvm/test/CodeGen/ARM64/inline-asm.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-inline-asm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -no-integrated-as | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -no-integrated-as | FileCheck %s
 
 ; rdar://9167275
 
diff --git a/llvm/test/CodeGen/ARM64/join-reserved.ll b/llvm/test/CodeGen/AArch64/arm64-join-reserved.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/join-reserved.ll
rename to llvm/test/CodeGen/AArch64/arm64-join-reserved.ll
diff --git a/llvm/test/CodeGen/ARM64/jumptable.ll b/llvm/test/CodeGen/AArch64/arm64-jumptable.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/jumptable.ll
rename to llvm/test/CodeGen/AArch64/arm64-jumptable.ll
diff --git a/llvm/test/CodeGen/ARM64/aarch64-large-frame.ll b/llvm/test/CodeGen/AArch64/arm64-large-frame.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/aarch64-large-frame.ll
rename to llvm/test/CodeGen/AArch64/arm64-large-frame.ll
diff --git a/llvm/test/CodeGen/ARM64/ld1.ll b/llvm/test/CodeGen/AArch64/arm64-ld1.ll
similarity index 67%
rename from llvm/test/CodeGen/ARM64/ld1.ll
rename to llvm/test/CodeGen/AArch64/arm64-ld1.ll
index 61836a1..72d808c 100644
--- a/llvm/test/CodeGen/ARM64/ld1.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ld1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
 
 %struct.__neon_int8x8x2_t = type { <8 x i8>,  <8 x i8> }
 %struct.__neon_int8x8x3_t = type { <8 x i8>,  <8 x i8>,  <8 x i8> }
@@ -10,7 +10,7 @@
 ; and from the argument of the function also defined by ABI (i.e., x0)
 ; CHECK ld2.8b { v0, v1 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld2.v8i8.p0i8(i8* %A)
+	%tmp2 = call %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld2.v8i8.p0i8(i8* %A)
 	ret %struct.__neon_int8x8x2_t  %tmp2
 }
 
@@ -19,7 +19,7 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld3.8b { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld3.v8i8.p0i8(i8* %A)
+	%tmp2 = call %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld3.v8i8.p0i8(i8* %A)
 	ret %struct.__neon_int8x8x3_t  %tmp2
 }
 
@@ -28,13 +28,13 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld4.8b { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld4.v8i8.p0i8(i8* %A)
+	%tmp2 = call %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld4.v8i8.p0i8(i8* %A)
 	ret %struct.__neon_int8x8x4_t  %tmp2
 }
 
-declare %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld2.v8i8.p0i8(i8*) nounwind readonly
-declare %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld3.v8i8.p0i8(i8*) nounwind readonly
-declare %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld4.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld2.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld3.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld4.v8i8.p0i8(i8*) nounwind readonly
 
 %struct.__neon_int8x16x2_t = type { <16 x i8>,  <16 x i8> }
 %struct.__neon_int8x16x3_t = type { <16 x i8>,  <16 x i8>,  <16 x i8> }
@@ -45,7 +45,7 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld2.16b { v0, v1 }, [x0]
 ; CHECK-NEXT ret
-  %tmp2 = call %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2.v16i8.p0i8(i8* %A)
+  %tmp2 = call %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2.v16i8.p0i8(i8* %A)
   ret %struct.__neon_int8x16x2_t  %tmp2
 }
 
@@ -54,7 +54,7 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld3.16b { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
-  %tmp2 = call %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3.v16i8.p0i8(i8* %A)
+  %tmp2 = call %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3.v16i8.p0i8(i8* %A)
   ret %struct.__neon_int8x16x3_t  %tmp2
 }
 
@@ -63,13 +63,13 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld4.16b { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
-  %tmp2 = call %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4.v16i8.p0i8(i8* %A)
+  %tmp2 = call %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4.v16i8.p0i8(i8* %A)
   ret %struct.__neon_int8x16x4_t  %tmp2
 }
 
-declare %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2.v16i8.p0i8(i8*) nounwind readonly
-declare %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3.v16i8.p0i8(i8*) nounwind readonly
-declare %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4.v16i8.p0i8(i8*) nounwind readonly
 
 %struct.__neon_int16x4x2_t = type { <4 x i16>,  <4 x i16> }
 %struct.__neon_int16x4x3_t = type { <4 x i16>,  <4 x i16>,  <4 x i16> }
@@ -80,7 +80,7 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld2.4h { v0, v1 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm64.neon.ld2.v4i16.p0i16(i16* %A)
+	%tmp2 = call %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld2.v4i16.p0i16(i16* %A)
 	ret %struct.__neon_int16x4x2_t  %tmp2
 }
 
@@ -89,7 +89,7 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld3.4h { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld3.v4i16.p0i16(i16* %A)
+	%tmp2 = call %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld3.v4i16.p0i16(i16* %A)
 	ret %struct.__neon_int16x4x3_t  %tmp2
 }
 
@@ -98,13 +98,13 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld4.4h { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld4.v4i16.p0i16(i16* %A)
+	%tmp2 = call %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld4.v4i16.p0i16(i16* %A)
 	ret %struct.__neon_int16x4x4_t  %tmp2
 }
 
-declare %struct.__neon_int16x4x2_t @llvm.arm64.neon.ld2.v4i16.p0i16(i16*) nounwind readonly
-declare %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld3.v4i16.p0i16(i16*) nounwind readonly
-declare %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld4.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld2.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld3.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld4.v4i16.p0i16(i16*) nounwind readonly
 
 %struct.__neon_int16x8x2_t = type { <8 x i16>,  <8 x i16> }
 %struct.__neon_int16x8x3_t = type { <8 x i16>,  <8 x i16>,  <8 x i16> }
@@ -115,7 +115,7 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld2.8h { v0, v1 }, [x0]
 ; CHECK-NEXT ret
-  %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2.v8i16.p0i16(i16* %A)
+  %tmp2 = call %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2.v8i16.p0i16(i16* %A)
   ret %struct.__neon_int16x8x2_t  %tmp2
 }
 
@@ -124,7 +124,7 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld3.8h { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
-  %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3.v8i16.p0i16(i16* %A)
+  %tmp2 = call %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3.v8i16.p0i16(i16* %A)
   ret %struct.__neon_int16x8x3_t  %tmp2
 }
 
@@ -133,13 +133,13 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld4.8h { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
-  %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4.v8i16.p0i16(i16* %A)
+  %tmp2 = call %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4.v8i16.p0i16(i16* %A)
   ret %struct.__neon_int16x8x4_t  %tmp2
 }
 
-declare %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2.v8i16.p0i16(i16*) nounwind readonly
-declare %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3.v8i16.p0i16(i16*) nounwind readonly
-declare %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4.v8i16.p0i16(i16*) nounwind readonly
 
 %struct.__neon_int32x2x2_t = type { <2 x i32>,  <2 x i32> }
 %struct.__neon_int32x2x3_t = type { <2 x i32>,  <2 x i32>,  <2 x i32> }
@@ -150,7 +150,7 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld2.2s { v0, v1 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld2.v2i32.p0i32(i32* %A)
+	%tmp2 = call %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld2.v2i32.p0i32(i32* %A)
 	ret %struct.__neon_int32x2x2_t  %tmp2
 }
 
@@ -159,7 +159,7 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld3.2s { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm64.neon.ld3.v2i32.p0i32(i32* %A)
+	%tmp2 = call %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld3.v2i32.p0i32(i32* %A)
 	ret %struct.__neon_int32x2x3_t  %tmp2
 }
 
@@ -168,13 +168,13 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld4.2s { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld4.v2i32.p0i32(i32* %A)
+	%tmp2 = call %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld4.v2i32.p0i32(i32* %A)
 	ret %struct.__neon_int32x2x4_t  %tmp2
 }
 
-declare %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld2.v2i32.p0i32(i32*) nounwind readonly
-declare %struct.__neon_int32x2x3_t @llvm.arm64.neon.ld3.v2i32.p0i32(i32*) nounwind readonly
-declare %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld4.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld2.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld3.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld4.v2i32.p0i32(i32*) nounwind readonly
 
 %struct.__neon_int32x4x2_t = type { <4 x i32>,  <4 x i32> }
 %struct.__neon_int32x4x3_t = type { <4 x i32>,  <4 x i32>,  <4 x i32> }
@@ -185,7 +185,7 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld2.4s { v0, v1 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2.v4i32.p0i32(i32* %A)
+	%tmp2 = call %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2.v4i32.p0i32(i32* %A)
 	ret %struct.__neon_int32x4x2_t  %tmp2
 }
 
@@ -194,7 +194,7 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld3.4s { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3.v4i32.p0i32(i32* %A)
+	%tmp2 = call %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3.v4i32.p0i32(i32* %A)
 	ret %struct.__neon_int32x4x3_t  %tmp2
 }
 
@@ -203,13 +203,13 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld4.4s { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4.v4i32.p0i32(i32* %A)
+	%tmp2 = call %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4.v4i32.p0i32(i32* %A)
 	ret %struct.__neon_int32x4x4_t  %tmp2
 }
 
-declare %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2.v4i32.p0i32(i32*) nounwind readonly
-declare %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3.v4i32.p0i32(i32*) nounwind readonly
-declare %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4.v4i32.p0i32(i32*) nounwind readonly
 
 %struct.__neon_int64x2x2_t = type { <2 x i64>,  <2 x i64> }
 %struct.__neon_int64x2x3_t = type { <2 x i64>,  <2 x i64>,  <2 x i64> }
@@ -220,7 +220,7 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld2.2d { v0, v1 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2.v2i64.p0i64(i64* %A)
+	%tmp2 = call %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2.v2i64.p0i64(i64* %A)
 	ret %struct.__neon_int64x2x2_t  %tmp2
 }
 
@@ -229,7 +229,7 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld3.2d { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3.v2i64.p0i64(i64* %A)
+	%tmp2 = call %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3.v2i64.p0i64(i64* %A)
 	ret %struct.__neon_int64x2x3_t  %tmp2
 }
 
@@ -238,13 +238,13 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld4.2d { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4.v2i64.p0i64(i64* %A)
+	%tmp2 = call %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4.v2i64.p0i64(i64* %A)
 	ret %struct.__neon_int64x2x4_t  %tmp2
 }
 
-declare %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2.v2i64.p0i64(i64*) nounwind readonly
-declare %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3.v2i64.p0i64(i64*) nounwind readonly
-declare %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4.v2i64.p0i64(i64*) nounwind readonly
 
 %struct.__neon_int64x1x2_t = type { <1 x i64>,  <1 x i64> }
 %struct.__neon_int64x1x3_t = type { <1 x i64>,  <1 x i64>, <1 x i64> }
@@ -256,7 +256,7 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld1.1d { v0, v1 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int64x1x2_t @llvm.arm64.neon.ld2.v1i64.p0i64(i64* %A)
+	%tmp2 = call %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld2.v1i64.p0i64(i64* %A)
 	ret %struct.__neon_int64x1x2_t  %tmp2
 }
 
@@ -265,7 +265,7 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld1.1d { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int64x1x3_t @llvm.arm64.neon.ld3.v1i64.p0i64(i64* %A)
+	%tmp2 = call %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld3.v1i64.p0i64(i64* %A)
 	ret %struct.__neon_int64x1x3_t  %tmp2
 }
 
@@ -274,14 +274,14 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld1.1d { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int64x1x4_t @llvm.arm64.neon.ld4.v1i64.p0i64(i64* %A)
+	%tmp2 = call %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld4.v1i64.p0i64(i64* %A)
 	ret %struct.__neon_int64x1x4_t  %tmp2
 }
 
 
-declare %struct.__neon_int64x1x2_t @llvm.arm64.neon.ld2.v1i64.p0i64(i64*) nounwind readonly
-declare %struct.__neon_int64x1x3_t @llvm.arm64.neon.ld3.v1i64.p0i64(i64*) nounwind readonly
-declare %struct.__neon_int64x1x4_t @llvm.arm64.neon.ld4.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld2.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld3.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld4.v1i64.p0i64(i64*) nounwind readonly
 
 %struct.__neon_float64x1x2_t = type { <1 x double>,  <1 x double> }
 %struct.__neon_float64x1x3_t = type { <1 x double>,  <1 x double>, <1 x double> }
@@ -293,7 +293,7 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld1.1d { v0, v1 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_float64x1x2_t @llvm.arm64.neon.ld2.v1f64.p0f64(double* %A)
+	%tmp2 = call %struct.__neon_float64x1x2_t @llvm.aarch64.neon.ld2.v1f64.p0f64(double* %A)
 	ret %struct.__neon_float64x1x2_t  %tmp2
 }
 
@@ -302,7 +302,7 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld1.1d { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_float64x1x3_t @llvm.arm64.neon.ld3.v1f64.p0f64(double* %A)
+	%tmp2 = call %struct.__neon_float64x1x3_t @llvm.aarch64.neon.ld3.v1f64.p0f64(double* %A)
 	ret %struct.__neon_float64x1x3_t  %tmp2
 }
 
@@ -311,13 +311,13 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld1.1d { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_float64x1x4_t @llvm.arm64.neon.ld4.v1f64.p0f64(double* %A)
+	%tmp2 = call %struct.__neon_float64x1x4_t @llvm.aarch64.neon.ld4.v1f64.p0f64(double* %A)
 	ret %struct.__neon_float64x1x4_t  %tmp2
 }
 
-declare %struct.__neon_float64x1x2_t @llvm.arm64.neon.ld2.v1f64.p0f64(double*) nounwind readonly
-declare %struct.__neon_float64x1x3_t @llvm.arm64.neon.ld3.v1f64.p0f64(double*) nounwind readonly
-declare %struct.__neon_float64x1x4_t @llvm.arm64.neon.ld4.v1f64.p0f64(double*) nounwind readonly
+declare %struct.__neon_float64x1x2_t @llvm.aarch64.neon.ld2.v1f64.p0f64(double*) nounwind readonly
+declare %struct.__neon_float64x1x3_t @llvm.aarch64.neon.ld3.v1f64.p0f64(double*) nounwind readonly
+declare %struct.__neon_float64x1x4_t @llvm.aarch64.neon.ld4.v1f64.p0f64(double*) nounwind readonly
 
 
 define %struct.__neon_int8x16x2_t @ld2lane_16b(<16 x i8> %L1, <16 x i8> %L2, i8* %A) nounwind {
@@ -325,7 +325,7 @@
 ; CHECK: ld2lane_16b
 ; CHECK ld2.b { v0, v1 }[1], [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2lane.v16i8.p0i8(<16 x i8> %L1, <16 x i8> %L2, i64 1, i8* %A)
+	%tmp2 = call %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> %L1, <16 x i8> %L2, i64 1, i8* %A)
 	ret %struct.__neon_int8x16x2_t  %tmp2
 }
 
@@ -334,7 +334,7 @@
 ; CHECK: ld3lane_16b
 ; CHECK ld3.b { v0, v1, v2 }[1], [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3lane.v16i8.p0i8(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, i64 1, i8* %A)
+	%tmp2 = call %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, i64 1, i8* %A)
 	ret %struct.__neon_int8x16x3_t  %tmp2
 }
 
@@ -343,20 +343,20 @@
 ; CHECK: ld4lane_16b
 ; CHECK ld4.b { v0, v1, v2, v3 }[1], [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4lane.v16i8.p0i8(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, <16 x i8> %L4, i64 1, i8* %A)
+	%tmp2 = call %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> %L1, <16 x i8> %L2, <16 x i8> %L3, <16 x i8> %L4, i64 1, i8* %A)
 	ret %struct.__neon_int8x16x4_t  %tmp2
 }
 
-declare %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
-declare %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
-declare %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
+declare %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
+declare %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
+declare %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readonly
 
 define %struct.__neon_int16x8x2_t @ld2lane_8h(<8 x i16> %L1, <8 x i16> %L2, i16* %A) nounwind {
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld2lane_8h
 ; CHECK ld2.h { v0, v1 }[1], [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2lane.v8i16.p0i16(<8 x i16> %L1, <8 x i16> %L2, i64 1, i16* %A)
+	%tmp2 = call %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2lane.v8i16.p0i16(<8 x i16> %L1, <8 x i16> %L2, i64 1, i16* %A)
 	ret %struct.__neon_int16x8x2_t  %tmp2
 }
 
@@ -365,7 +365,7 @@
 ; CHECK: ld3lane_8h
 ; CHECK ld3.h { v0, v1, v2 }[1], [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3lane.v8i16.p0i16(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, i64 1, i16* %A)
+	%tmp2 = call %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3lane.v8i16.p0i16(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, i64 1, i16* %A)
 	ret %struct.__neon_int16x8x3_t  %tmp2
 }
 
@@ -374,20 +374,20 @@
 ; CHECK: ld4lane_8h
 ; CHECK ld4.h { v0, v1, v2, v3 }[1], [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4lane.v8i16.p0i16(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, <8 x i16> %L4, i64 1, i16* %A)
+	%tmp2 = call %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4lane.v8i16.p0i16(<8 x i16> %L1, <8 x i16> %L2, <8 x i16> %L3, <8 x i16> %L4, i64 1, i16* %A)
 	ret %struct.__neon_int16x8x4_t  %tmp2
 }
 
-declare %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
-declare %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
-declare %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
+declare %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
+declare %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readonly
 
 define %struct.__neon_int32x4x2_t @ld2lane_4s(<4 x i32> %L1, <4 x i32> %L2, i32* %A) nounwind {
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld2lane_4s
 ; CHECK ld2.s { v0, v1 }[1], [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2lane.v4i32.p0i32(<4 x i32> %L1, <4 x i32> %L2, i64 1, i32* %A)
+	%tmp2 = call %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32> %L1, <4 x i32> %L2, i64 1, i32* %A)
 	ret %struct.__neon_int32x4x2_t  %tmp2
 }
 
@@ -396,7 +396,7 @@
 ; CHECK: ld3lane_4s
 ; CHECK ld3.s { v0, v1, v2 }[1], [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3lane.v4i32.p0i32(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, i64 1, i32* %A)
+	%tmp2 = call %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, i64 1, i32* %A)
 	ret %struct.__neon_int32x4x3_t  %tmp2
 }
 
@@ -405,20 +405,20 @@
 ; CHECK: ld4lane_4s
 ; CHECK ld4.s { v0, v1, v2, v3 }[1], [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4lane.v4i32.p0i32(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, <4 x i32> %L4, i64 1, i32* %A)
+	%tmp2 = call %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32> %L1, <4 x i32> %L2, <4 x i32> %L3, <4 x i32> %L4, i64 1, i32* %A)
 	ret %struct.__neon_int32x4x4_t  %tmp2
 }
 
-declare %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
-declare %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
-declare %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
+declare %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readonly
 
 define %struct.__neon_int64x2x2_t @ld2lane_2d(<2 x i64> %L1, <2 x i64> %L2, i64* %A) nounwind {
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld2lane_2d
 ; CHECK ld2.d { v0, v1 }[1], [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2lane.v2i64.p0i64(<2 x i64> %L1, <2 x i64> %L2, i64 1, i64* %A)
+	%tmp2 = call %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2lane.v2i64.p0i64(<2 x i64> %L1, <2 x i64> %L2, i64 1, i64* %A)
 	ret %struct.__neon_int64x2x2_t  %tmp2
 }
 
@@ -427,7 +427,7 @@
 ; CHECK: ld3lane_2d
 ; CHECK ld3.d { v0, v1, v2 }[1], [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3lane.v2i64.p0i64(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, i64 1, i64* %A)
+	%tmp2 = call %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3lane.v2i64.p0i64(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, i64 1, i64* %A)
 	ret %struct.__neon_int64x2x3_t  %tmp2
 }
 
@@ -436,13 +436,13 @@
 ; CHECK: ld4lane_2d
 ; CHECK ld4.d { v0, v1, v2, v3 }[1], [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4lane.v2i64.p0i64(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, <2 x i64> %L4, i64 1, i64* %A)
+	%tmp2 = call %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4lane.v2i64.p0i64(<2 x i64> %L1, <2 x i64> %L2, <2 x i64> %L3, <2 x i64> %L4, i64 1, i64* %A)
 	ret %struct.__neon_int64x2x4_t  %tmp2
 }
 
-declare %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
-declare %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
-declare %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
+declare %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
+declare %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
+declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readonly
 
 define <8 x i8> @ld1r_8b(i8* %bar) {
 ; CHECK: ld1r_8b
@@ -556,7 +556,7 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld2r.8b { v0, v1 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld2r.v8i8.p0i8(i8* %A)
+	%tmp2 = call %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %A)
 	ret %struct.__neon_int8x8x2_t  %tmp2
 }
 
@@ -565,7 +565,7 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld3r.8b { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld3r.v8i8.p0i8(i8* %A)
+	%tmp2 = call %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %A)
 	ret %struct.__neon_int8x8x3_t  %tmp2
 }
 
@@ -574,20 +574,20 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld4r.8b { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld4r.v8i8.p0i8(i8* %A)
+	%tmp2 = call %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %A)
 	ret %struct.__neon_int8x8x4_t  %tmp2
 }
 
-declare %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld2r.v8i8.p0i8(i8*) nounwind readonly
-declare %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld3r.v8i8.p0i8(i8*) nounwind readonly
-declare %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld4r.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8*) nounwind readonly
 
 define %struct.__neon_int8x16x2_t @ld2r_16b(i8* %A) nounwind {
 ; CHECK: ld2r_16b
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld2r.16b { v0, v1 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2r.v16i8.p0i8(i8* %A)
+	%tmp2 = call %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %A)
 	ret %struct.__neon_int8x16x2_t  %tmp2
 }
 
@@ -596,7 +596,7 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld3r.16b { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3r.v16i8.p0i8(i8* %A)
+	%tmp2 = call %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %A)
 	ret %struct.__neon_int8x16x3_t  %tmp2
 }
 
@@ -605,20 +605,20 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld4r.16b { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4r.v16i8.p0i8(i8* %A)
+	%tmp2 = call %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %A)
 	ret %struct.__neon_int8x16x4_t  %tmp2
 }
 
-declare %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld2r.v16i8.p0i8(i8*) nounwind readonly
-declare %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld3r.v16i8.p0i8(i8*) nounwind readonly
-declare %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld4r.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8*) nounwind readonly
 
 define %struct.__neon_int16x4x2_t @ld2r_4h(i16* %A) nounwind {
 ; CHECK: ld2r_4h
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld2r.4h { v0, v1 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm64.neon.ld2r.v4i16.p0i16(i16* %A)
+	%tmp2 = call %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* %A)
 	ret %struct.__neon_int16x4x2_t  %tmp2
 }
 
@@ -627,7 +627,7 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld3r.4h { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld3r.v4i16.p0i16(i16* %A)
+	%tmp2 = call %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* %A)
 	ret %struct.__neon_int16x4x3_t  %tmp2
 }
 
@@ -636,20 +636,20 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld4r.4h { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
-	%tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld4r.v4i16.p0i16(i16* %A)
+	%tmp2 = call %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* %A)
 	ret %struct.__neon_int16x4x4_t  %tmp2
 }
 
-declare %struct.__neon_int16x4x2_t @llvm.arm64.neon.ld2r.v4i16.p0i16(i16*) nounwind readonly
-declare %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld3r.v4i16.p0i16(i16*) nounwind readonly
-declare %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld4r.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16*) nounwind readonly
 
 define %struct.__neon_int16x8x2_t @ld2r_8h(i16* %A) nounwind {
 ; CHECK: ld2r_8h
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld2r.8h { v0, v1 }, [x0]
 ; CHECK-NEXT ret
-  %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2r.v8i16.p0i16(i16* %A)
+  %tmp2 = call %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* %A)
   ret %struct.__neon_int16x8x2_t  %tmp2
 }
 
@@ -658,7 +658,7 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld3r.8h { v0, v1, v2 }, [x0]
 ; CHECK-NEXT ret
-  %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3r.v8i16.p0i16(i16* %A)
+  %tmp2 = call %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* %A)
   ret %struct.__neon_int16x8x3_t  %tmp2
 }
 
@@ -667,20 +667,20 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK ld4r.8h { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT ret
-  %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4r.v8i16.p0i16(i16* %A)
+  %tmp2 = call %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* %A)
   ret %struct.__neon_int16x8x4_t  %tmp2
 }
 
-declare %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld2r.v8i16.p0i16(i16*) nounwind readonly
-declare %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld3r.v8i16.p0i16(i16*) nounwind readonly
-declare %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld4r.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16*) nounwind readonly
 
 define %struct.__neon_int32x2x2_t @ld2r_2s(i32* %A) nounwind {
 ; CHECK: ld2r_2s
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld2r.2s { v0, v1 }, [x0]
 ; CHECK-NEXT: ret
-	%tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld2r.v2i32.p0i32(i32* %A)
+	%tmp2 = call %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* %A)
 	ret %struct.__neon_int32x2x2_t  %tmp2
 }
 
@@ -689,7 +689,7 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld3r.2s { v0, v1, v2 }, [x0]
 ; CHECK-NEXT: ret
-	%tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm64.neon.ld3r.v2i32.p0i32(i32* %A)
+	%tmp2 = call %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* %A)
 	ret %struct.__neon_int32x2x3_t  %tmp2
 }
 
@@ -698,20 +698,20 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld4r.2s { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT: ret
-	%tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld4r.v2i32.p0i32(i32* %A)
+	%tmp2 = call %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* %A)
 	ret %struct.__neon_int32x2x4_t  %tmp2
 }
 
-declare %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld2r.v2i32.p0i32(i32*) nounwind readonly
-declare %struct.__neon_int32x2x3_t @llvm.arm64.neon.ld3r.v2i32.p0i32(i32*) nounwind readonly
-declare %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld4r.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32*) nounwind readonly
 
 define %struct.__neon_int32x4x2_t @ld2r_4s(i32* %A) nounwind {
 ; CHECK: ld2r_4s
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld2r.4s { v0, v1 }, [x0]
 ; CHECK-NEXT: ret
-	%tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2r.v4i32.p0i32(i32* %A)
+	%tmp2 = call %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32* %A)
 	ret %struct.__neon_int32x4x2_t  %tmp2
 }
 
@@ -720,7 +720,7 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld3r.4s { v0, v1, v2 }, [x0]
 ; CHECK-NEXT: ret
-	%tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3r.v4i32.p0i32(i32* %A)
+	%tmp2 = call %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32* %A)
 	ret %struct.__neon_int32x4x3_t  %tmp2
 }
 
@@ -729,20 +729,20 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld4r.4s { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT: ret
-	%tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4r.v4i32.p0i32(i32* %A)
+	%tmp2 = call %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32* %A)
 	ret %struct.__neon_int32x4x4_t  %tmp2
 }
 
-declare %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld2r.v4i32.p0i32(i32*) nounwind readonly
-declare %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld3r.v4i32.p0i32(i32*) nounwind readonly
-declare %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld4r.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32*) nounwind readonly
 
 define %struct.__neon_int64x1x2_t @ld2r_1d(i64* %A) nounwind {
 ; CHECK: ld2r_1d
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld2r.1d { v0, v1 }, [x0]
 ; CHECK-NEXT: ret
-	%tmp2 = call %struct.__neon_int64x1x2_t @llvm.arm64.neon.ld2r.v1i64.p0i64(i64* %A)
+	%tmp2 = call %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* %A)
 	ret %struct.__neon_int64x1x2_t  %tmp2
 }
 
@@ -751,7 +751,7 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld3r.1d { v0, v1, v2 }, [x0]
 ; CHECK-NEXT: ret
-	%tmp2 = call %struct.__neon_int64x1x3_t @llvm.arm64.neon.ld3r.v1i64.p0i64(i64* %A)
+	%tmp2 = call %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* %A)
 	ret %struct.__neon_int64x1x3_t  %tmp2
 }
 
@@ -760,20 +760,20 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld4r.1d { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT: ret
-	%tmp2 = call %struct.__neon_int64x1x4_t @llvm.arm64.neon.ld4r.v1i64.p0i64(i64* %A)
+	%tmp2 = call %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* %A)
 	ret %struct.__neon_int64x1x4_t  %tmp2
 }
 
-declare %struct.__neon_int64x1x2_t @llvm.arm64.neon.ld2r.v1i64.p0i64(i64*) nounwind readonly
-declare %struct.__neon_int64x1x3_t @llvm.arm64.neon.ld3r.v1i64.p0i64(i64*) nounwind readonly
-declare %struct.__neon_int64x1x4_t @llvm.arm64.neon.ld4r.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64*) nounwind readonly
 
 define %struct.__neon_int64x2x2_t @ld2r_2d(i64* %A) nounwind {
 ; CHECK: ld2r_2d
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld2r.2d { v0, v1 }, [x0]
 ; CHECK-NEXT: ret
-	%tmp2 = call %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2r.v2i64.p0i64(i64* %A)
+	%tmp2 = call %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* %A)
 	ret %struct.__neon_int64x2x2_t  %tmp2
 }
 
@@ -782,7 +782,7 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld3r.2d { v0, v1, v2 }, [x0]
 ; CHECK-NEXT: ret
-	%tmp2 = call %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3r.v2i64.p0i64(i64* %A)
+	%tmp2 = call %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* %A)
 	ret %struct.__neon_int64x2x3_t  %tmp2
 }
 
@@ -791,13 +791,13 @@
 ; Make sure we are using the operands defined by the ABI
 ; CHECK: ld4r.2d { v0, v1, v2, v3 }, [x0]
 ; CHECK-NEXT: ret
-	%tmp2 = call %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4r.v2i64.p0i64(i64* %A)
+	%tmp2 = call %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* %A)
 	ret %struct.__neon_int64x2x4_t  %tmp2
 }
 
-declare %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld2r.v2i64.p0i64(i64*) nounwind readonly
-declare %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld3r.v2i64.p0i64(i64*) nounwind readonly
-declare %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld4r.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64*) nounwind readonly
 
 define <16 x i8> @ld1_16b(<16 x i8> %V, i8* %bar) {
 ; CHECK-LABEL: ld1_16b
@@ -1041,52 +1041,52 @@
 %struct.__neon_float32x2x3_t = type { <2 x float>,  <2 x float>,  <2 x float> }
 %struct.__neon_float32x2x4_t = type { <2 x float>,  <2 x float>, <2 x float>,  <2 x float> }
 
-declare %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld1x2.v8i8.p0i8(i8*) nounwind readonly
-declare %struct.__neon_int16x4x2_t @llvm.arm64.neon.ld1x2.v4i16.p0i16(i16*) nounwind readonly
-declare %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld1x2.v2i32.p0i32(i32*) nounwind readonly
-declare %struct.__neon_float32x2x2_t @llvm.arm64.neon.ld1x2.v2f32.p0f32(float*) nounwind readonly
-declare %struct.__neon_int64x1x2_t @llvm.arm64.neon.ld1x2.v1i64.p0i64(i64*) nounwind readonly
-declare %struct.__neon_float64x1x2_t @llvm.arm64.neon.ld1x2.v1f64.p0f64(double*) nounwind readonly
+declare %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x1x2_t @llvm.aarch64.neon.ld1x2.v1f64.p0f64(double*) nounwind readonly
 
 define %struct.__neon_int8x8x2_t @ld1_x2_v8i8(i8* %addr) {
 ; CHECK-LABEL: ld1_x2_v8i8:
 ; CHECK: ld1.8b { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_int8x8x2_t @llvm.arm64.neon.ld1x2.v8i8.p0i8(i8* %addr)
+  %val = call %struct.__neon_int8x8x2_t @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %addr)
   ret %struct.__neon_int8x8x2_t %val
 }
 
 define %struct.__neon_int16x4x2_t @ld1_x2_v4i16(i16* %addr) {
 ; CHECK-LABEL: ld1_x2_v4i16:
 ; CHECK: ld1.4h { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_int16x4x2_t @llvm.arm64.neon.ld1x2.v4i16.p0i16(i16* %addr)
+  %val = call %struct.__neon_int16x4x2_t @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* %addr)
   ret %struct.__neon_int16x4x2_t %val
 }
 
 define %struct.__neon_int32x2x2_t @ld1_x2_v2i32(i32* %addr) {
 ; CHECK-LABEL: ld1_x2_v2i32:
 ; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_int32x2x2_t @llvm.arm64.neon.ld1x2.v2i32.p0i32(i32* %addr)
+  %val = call %struct.__neon_int32x2x2_t @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32* %addr)
   ret %struct.__neon_int32x2x2_t %val
 }
 
 define %struct.__neon_float32x2x2_t @ld1_x2_v2f32(float* %addr) {
 ; CHECK-LABEL: ld1_x2_v2f32:
 ; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_float32x2x2_t @llvm.arm64.neon.ld1x2.v2f32.p0f32(float* %addr)
+  %val = call %struct.__neon_float32x2x2_t @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float* %addr)
   ret %struct.__neon_float32x2x2_t %val
 }
 
 define %struct.__neon_int64x1x2_t @ld1_x2_v1i64(i64* %addr) {
 ; CHECK-LABEL: ld1_x2_v1i64:
 ; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_int64x1x2_t @llvm.arm64.neon.ld1x2.v1i64.p0i64(i64* %addr)
+  %val = call %struct.__neon_int64x1x2_t @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64* %addr)
   ret %struct.__neon_int64x1x2_t %val
 }
 
 define %struct.__neon_float64x1x2_t @ld1_x2_v1f64(double* %addr) {
 ; CHECK-LABEL: ld1_x2_v1f64:
 ; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_float64x1x2_t @llvm.arm64.neon.ld1x2.v1f64.p0f64(double* %addr)
+  %val = call %struct.__neon_float64x1x2_t @llvm.aarch64.neon.ld1x2.v1f64.p0f64(double* %addr)
   ret %struct.__neon_float64x1x2_t %val
 }
 
@@ -1099,247 +1099,247 @@
 %struct.__neon_float64x2x3_t = type { <2 x double>,  <2 x double>,  <2 x double> }
 %struct.__neon_float64x2x4_t = type { <2 x double>,  <2 x double>, <2 x double>,  <2 x double> }
 
-declare %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld1x2.v16i8.p0i8(i8*) nounwind readonly
-declare %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld1x2.v8i16.p0i16(i16*) nounwind readonly
-declare %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld1x2.v4i32.p0i32(i32*) nounwind readonly
-declare %struct.__neon_float32x4x2_t @llvm.arm64.neon.ld1x2.v4f32.p0f32(float*) nounwind readonly
-declare %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld1x2.v2i64.p0i64(i64*) nounwind readonly
-declare %struct.__neon_float64x2x2_t @llvm.arm64.neon.ld1x2.v2f64.p0f64(double*) nounwind readonly
+declare %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x4x2_t @llvm.aarch64.neon.ld1x2.v4f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x2x2_t @llvm.aarch64.neon.ld1x2.v2f64.p0f64(double*) nounwind readonly
 
 define %struct.__neon_int8x16x2_t @ld1_x2_v16i8(i8* %addr) {
 ; CHECK-LABEL: ld1_x2_v16i8:
 ; CHECK: ld1.16b { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_int8x16x2_t @llvm.arm64.neon.ld1x2.v16i8.p0i8(i8* %addr)
+  %val = call %struct.__neon_int8x16x2_t @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %addr)
   ret %struct.__neon_int8x16x2_t %val
 }
 
 define %struct.__neon_int16x8x2_t @ld1_x2_v8i16(i16* %addr) {
 ; CHECK-LABEL: ld1_x2_v8i16:
 ; CHECK: ld1.8h { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_int16x8x2_t @llvm.arm64.neon.ld1x2.v8i16.p0i16(i16* %addr)
+  %val = call %struct.__neon_int16x8x2_t @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* %addr)
   ret %struct.__neon_int16x8x2_t %val
 }
 
 define %struct.__neon_int32x4x2_t @ld1_x2_v4i32(i32* %addr) {
 ; CHECK-LABEL: ld1_x2_v4i32:
 ; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_int32x4x2_t @llvm.arm64.neon.ld1x2.v4i32.p0i32(i32* %addr)
+  %val = call %struct.__neon_int32x4x2_t @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32* %addr)
   ret %struct.__neon_int32x4x2_t %val
 }
 
 define %struct.__neon_float32x4x2_t @ld1_x2_v4f32(float* %addr) {
 ; CHECK-LABEL: ld1_x2_v4f32:
 ; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_float32x4x2_t @llvm.arm64.neon.ld1x2.v4f32.p0f32(float* %addr)
+  %val = call %struct.__neon_float32x4x2_t @llvm.aarch64.neon.ld1x2.v4f32.p0f32(float* %addr)
   ret %struct.__neon_float32x4x2_t %val
 }
 
 define %struct.__neon_int64x2x2_t @ld1_x2_v2i64(i64* %addr) {
 ; CHECK-LABEL: ld1_x2_v2i64:
 ; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_int64x2x2_t @llvm.arm64.neon.ld1x2.v2i64.p0i64(i64* %addr)
+  %val = call %struct.__neon_int64x2x2_t @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64* %addr)
   ret %struct.__neon_int64x2x2_t %val
 }
 
 define %struct.__neon_float64x2x2_t @ld1_x2_v2f64(double* %addr) {
 ; CHECK-LABEL: ld1_x2_v2f64:
 ; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_float64x2x2_t @llvm.arm64.neon.ld1x2.v2f64.p0f64(double* %addr)
+  %val = call %struct.__neon_float64x2x2_t @llvm.aarch64.neon.ld1x2.v2f64.p0f64(double* %addr)
   ret %struct.__neon_float64x2x2_t %val
 }
 
-declare %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld1x3.v8i8.p0i8(i8*) nounwind readonly
-declare %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld1x3.v4i16.p0i16(i16*) nounwind readonly
-declare %struct.__neon_int32x2x3_t @llvm.arm64.neon.ld1x3.v2i32.p0i32(i32*) nounwind readonly
-declare %struct.__neon_float32x2x3_t @llvm.arm64.neon.ld1x3.v2f32.p0f32(float*) nounwind readonly
-declare %struct.__neon_int64x1x3_t @llvm.arm64.neon.ld1x3.v1i64.p0i64(i64*) nounwind readonly
-declare %struct.__neon_float64x1x3_t @llvm.arm64.neon.ld1x3.v1f64.p0f64(double*) nounwind readonly
+declare %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x1x3_t @llvm.aarch64.neon.ld1x3.v1f64.p0f64(double*) nounwind readonly
 
 define %struct.__neon_int8x8x3_t @ld1_x3_v8i8(i8* %addr) {
 ; CHECK-LABEL: ld1_x3_v8i8:
 ; CHECK: ld1.8b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_int8x8x3_t @llvm.arm64.neon.ld1x3.v8i8.p0i8(i8* %addr)
+  %val = call %struct.__neon_int8x8x3_t @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %addr)
   ret %struct.__neon_int8x8x3_t %val
 }
 
 define %struct.__neon_int16x4x3_t @ld1_x3_v4i16(i16* %addr) {
 ; CHECK-LABEL: ld1_x3_v4i16:
 ; CHECK: ld1.4h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_int16x4x3_t @llvm.arm64.neon.ld1x3.v4i16.p0i16(i16* %addr)
+  %val = call %struct.__neon_int16x4x3_t @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* %addr)
   ret %struct.__neon_int16x4x3_t %val
 }
 
 define %struct.__neon_int32x2x3_t @ld1_x3_v2i32(i32* %addr) {
 ; CHECK-LABEL: ld1_x3_v2i32:
 ; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_int32x2x3_t @llvm.arm64.neon.ld1x3.v2i32.p0i32(i32* %addr)
+  %val = call %struct.__neon_int32x2x3_t @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32* %addr)
   ret %struct.__neon_int32x2x3_t %val
 }
 
 define %struct.__neon_float32x2x3_t @ld1_x3_v2f32(float* %addr) {
 ; CHECK-LABEL: ld1_x3_v2f32:
 ; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_float32x2x3_t @llvm.arm64.neon.ld1x3.v2f32.p0f32(float* %addr)
+  %val = call %struct.__neon_float32x2x3_t @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float* %addr)
   ret %struct.__neon_float32x2x3_t %val
 }
 
 define %struct.__neon_int64x1x3_t @ld1_x3_v1i64(i64* %addr) {
 ; CHECK-LABEL: ld1_x3_v1i64:
 ; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_int64x1x3_t @llvm.arm64.neon.ld1x3.v1i64.p0i64(i64* %addr)
+  %val = call %struct.__neon_int64x1x3_t @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64* %addr)
   ret %struct.__neon_int64x1x3_t %val
 }
 
 define %struct.__neon_float64x1x3_t @ld1_x3_v1f64(double* %addr) {
 ; CHECK-LABEL: ld1_x3_v1f64:
 ; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_float64x1x3_t @llvm.arm64.neon.ld1x3.v1f64.p0f64(double* %addr)
+  %val = call %struct.__neon_float64x1x3_t @llvm.aarch64.neon.ld1x3.v1f64.p0f64(double* %addr)
   ret %struct.__neon_float64x1x3_t %val
 }
 
-declare %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld1x3.v16i8.p0i8(i8*) nounwind readonly
-declare %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld1x3.v8i16.p0i16(i16*) nounwind readonly
-declare %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld1x3.v4i32.p0i32(i32*) nounwind readonly
-declare %struct.__neon_float32x4x3_t @llvm.arm64.neon.ld1x3.v4f32.p0f32(float*) nounwind readonly
-declare %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld1x3.v2i64.p0i64(i64*) nounwind readonly
-declare %struct.__neon_float64x2x3_t @llvm.arm64.neon.ld1x3.v2f64.p0f64(double*) nounwind readonly
+declare %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x4x3_t @llvm.aarch64.neon.ld1x3.v4f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x2x3_t @llvm.aarch64.neon.ld1x3.v2f64.p0f64(double*) nounwind readonly
 
 define %struct.__neon_int8x16x3_t @ld1_x3_v16i8(i8* %addr) {
 ; CHECK-LABEL: ld1_x3_v16i8:
 ; CHECK: ld1.16b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_int8x16x3_t @llvm.arm64.neon.ld1x3.v16i8.p0i8(i8* %addr)
+  %val = call %struct.__neon_int8x16x3_t @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %addr)
   ret %struct.__neon_int8x16x3_t %val
 }
 
 define %struct.__neon_int16x8x3_t @ld1_x3_v8i16(i16* %addr) {
 ; CHECK-LABEL: ld1_x3_v8i16:
 ; CHECK: ld1.8h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_int16x8x3_t @llvm.arm64.neon.ld1x3.v8i16.p0i16(i16* %addr)
+  %val = call %struct.__neon_int16x8x3_t @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* %addr)
   ret %struct.__neon_int16x8x3_t %val
 }
 
 define %struct.__neon_int32x4x3_t @ld1_x3_v4i32(i32* %addr) {
 ; CHECK-LABEL: ld1_x3_v4i32:
 ; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_int32x4x3_t @llvm.arm64.neon.ld1x3.v4i32.p0i32(i32* %addr)
+  %val = call %struct.__neon_int32x4x3_t @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32* %addr)
   ret %struct.__neon_int32x4x3_t %val
 }
 
 define %struct.__neon_float32x4x3_t @ld1_x3_v4f32(float* %addr) {
 ; CHECK-LABEL: ld1_x3_v4f32:
 ; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_float32x4x3_t @llvm.arm64.neon.ld1x3.v4f32.p0f32(float* %addr)
+  %val = call %struct.__neon_float32x4x3_t @llvm.aarch64.neon.ld1x3.v4f32.p0f32(float* %addr)
   ret %struct.__neon_float32x4x3_t %val
 }
 
 define %struct.__neon_int64x2x3_t @ld1_x3_v2i64(i64* %addr) {
 ; CHECK-LABEL: ld1_x3_v2i64:
 ; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_int64x2x3_t @llvm.arm64.neon.ld1x3.v2i64.p0i64(i64* %addr)
+  %val = call %struct.__neon_int64x2x3_t @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64* %addr)
   ret %struct.__neon_int64x2x3_t %val
 }
 
 define %struct.__neon_float64x2x3_t @ld1_x3_v2f64(double* %addr) {
 ; CHECK-LABEL: ld1_x3_v2f64:
 ; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_float64x2x3_t @llvm.arm64.neon.ld1x3.v2f64.p0f64(double* %addr)
+  %val = call %struct.__neon_float64x2x3_t @llvm.aarch64.neon.ld1x3.v2f64.p0f64(double* %addr)
   ret %struct.__neon_float64x2x3_t %val
 }
 
-declare %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld1x4.v8i8.p0i8(i8*) nounwind readonly
-declare %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld1x4.v4i16.p0i16(i16*) nounwind readonly
-declare %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld1x4.v2i32.p0i32(i32*) nounwind readonly
-declare %struct.__neon_float32x2x4_t @llvm.arm64.neon.ld1x4.v2f32.p0f32(float*) nounwind readonly
-declare %struct.__neon_int64x1x4_t @llvm.arm64.neon.ld1x4.v1i64.p0i64(i64*) nounwind readonly
-declare %struct.__neon_float64x1x4_t @llvm.arm64.neon.ld1x4.v1f64.p0f64(double*) nounwind readonly
+declare %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x1x4_t @llvm.aarch64.neon.ld1x4.v1f64.p0f64(double*) nounwind readonly
 
 define %struct.__neon_int8x8x4_t @ld1_x4_v8i8(i8* %addr) {
 ; CHECK-LABEL: ld1_x4_v8i8:
 ; CHECK: ld1.8b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_int8x8x4_t @llvm.arm64.neon.ld1x4.v8i8.p0i8(i8* %addr)
+  %val = call %struct.__neon_int8x8x4_t @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %addr)
   ret %struct.__neon_int8x8x4_t %val
 }
 
 define %struct.__neon_int16x4x4_t @ld1_x4_v4i16(i16* %addr) {
 ; CHECK-LABEL: ld1_x4_v4i16:
 ; CHECK: ld1.4h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_int16x4x4_t @llvm.arm64.neon.ld1x4.v4i16.p0i16(i16* %addr)
+  %val = call %struct.__neon_int16x4x4_t @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* %addr)
   ret %struct.__neon_int16x4x4_t %val
 }
 
 define %struct.__neon_int32x2x4_t @ld1_x4_v2i32(i32* %addr) {
 ; CHECK-LABEL: ld1_x4_v2i32:
 ; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_int32x2x4_t @llvm.arm64.neon.ld1x4.v2i32.p0i32(i32* %addr)
+  %val = call %struct.__neon_int32x2x4_t @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32* %addr)
   ret %struct.__neon_int32x2x4_t %val
 }
 
 define %struct.__neon_float32x2x4_t @ld1_x4_v2f32(float* %addr) {
 ; CHECK-LABEL: ld1_x4_v2f32:
 ; CHECK: ld1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_float32x2x4_t @llvm.arm64.neon.ld1x4.v2f32.p0f32(float* %addr)
+  %val = call %struct.__neon_float32x2x4_t @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float* %addr)
   ret %struct.__neon_float32x2x4_t %val
 }
 
 define %struct.__neon_int64x1x4_t @ld1_x4_v1i64(i64* %addr) {
 ; CHECK-LABEL: ld1_x4_v1i64:
 ; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_int64x1x4_t @llvm.arm64.neon.ld1x4.v1i64.p0i64(i64* %addr)
+  %val = call %struct.__neon_int64x1x4_t @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64* %addr)
   ret %struct.__neon_int64x1x4_t %val
 }
 
 define %struct.__neon_float64x1x4_t @ld1_x4_v1f64(double* %addr) {
 ; CHECK-LABEL: ld1_x4_v1f64:
 ; CHECK: ld1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_float64x1x4_t @llvm.arm64.neon.ld1x4.v1f64.p0f64(double* %addr)
+  %val = call %struct.__neon_float64x1x4_t @llvm.aarch64.neon.ld1x4.v1f64.p0f64(double* %addr)
   ret %struct.__neon_float64x1x4_t %val
 }
 
-declare %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld1x4.v16i8.p0i8(i8*) nounwind readonly
-declare %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld1x4.v8i16.p0i16(i16*) nounwind readonly
-declare %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld1x4.v4i32.p0i32(i32*) nounwind readonly
-declare %struct.__neon_float32x4x4_t @llvm.arm64.neon.ld1x4.v4f32.p0f32(float*) nounwind readonly
-declare %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld1x4.v2i64.p0i64(i64*) nounwind readonly
-declare %struct.__neon_float64x2x4_t @llvm.arm64.neon.ld1x4.v2f64.p0f64(double*) nounwind readonly
+declare %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8*) nounwind readonly
+declare %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16*) nounwind readonly
+declare %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32*) nounwind readonly
+declare %struct.__neon_float32x4x4_t @llvm.aarch64.neon.ld1x4.v4f32.p0f32(float*) nounwind readonly
+declare %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64*) nounwind readonly
+declare %struct.__neon_float64x2x4_t @llvm.aarch64.neon.ld1x4.v2f64.p0f64(double*) nounwind readonly
 
 define %struct.__neon_int8x16x4_t @ld1_x4_v16i8(i8* %addr) {
 ; CHECK-LABEL: ld1_x4_v16i8:
 ; CHECK: ld1.16b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_int8x16x4_t @llvm.arm64.neon.ld1x4.v16i8.p0i8(i8* %addr)
+  %val = call %struct.__neon_int8x16x4_t @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %addr)
   ret %struct.__neon_int8x16x4_t %val
 }
 
 define %struct.__neon_int16x8x4_t @ld1_x4_v8i16(i16* %addr) {
 ; CHECK-LABEL: ld1_x4_v8i16:
 ; CHECK: ld1.8h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_int16x8x4_t @llvm.arm64.neon.ld1x4.v8i16.p0i16(i16* %addr)
+  %val = call %struct.__neon_int16x8x4_t @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* %addr)
   ret %struct.__neon_int16x8x4_t %val
 }
 
 define %struct.__neon_int32x4x4_t @ld1_x4_v4i32(i32* %addr) {
 ; CHECK-LABEL: ld1_x4_v4i32:
 ; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_int32x4x4_t @llvm.arm64.neon.ld1x4.v4i32.p0i32(i32* %addr)
+  %val = call %struct.__neon_int32x4x4_t @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32* %addr)
   ret %struct.__neon_int32x4x4_t %val
 }
 
 define %struct.__neon_float32x4x4_t @ld1_x4_v4f32(float* %addr) {
 ; CHECK-LABEL: ld1_x4_v4f32:
 ; CHECK: ld1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_float32x4x4_t @llvm.arm64.neon.ld1x4.v4f32.p0f32(float* %addr)
+  %val = call %struct.__neon_float32x4x4_t @llvm.aarch64.neon.ld1x4.v4f32.p0f32(float* %addr)
   ret %struct.__neon_float32x4x4_t %val
 }
 
 define %struct.__neon_int64x2x4_t @ld1_x4_v2i64(i64* %addr) {
 ; CHECK-LABEL: ld1_x4_v2i64:
 ; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_int64x2x4_t @llvm.arm64.neon.ld1x4.v2i64.p0i64(i64* %addr)
+  %val = call %struct.__neon_int64x2x4_t @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64* %addr)
   ret %struct.__neon_int64x2x4_t %val
 }
 
 define %struct.__neon_float64x2x4_t @ld1_x4_v2f64(double* %addr) {
 ; CHECK-LABEL: ld1_x4_v2f64:
 ; CHECK: ld1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  %val = call %struct.__neon_float64x2x4_t @llvm.arm64.neon.ld1x4.v2f64.p0f64(double* %addr)
+  %val = call %struct.__neon_float64x2x4_t @llvm.aarch64.neon.ld1x4.v2f64.p0f64(double* %addr)
   ret %struct.__neon_float64x2x4_t %val
 }
diff --git a/llvm/test/CodeGen/ARM64/ldp.ll b/llvm/test/CodeGen/AArch64/arm64-ldp.ll
similarity index 98%
rename from llvm/test/CodeGen/ARM64/ldp.ll
rename to llvm/test/CodeGen/AArch64/arm64-ldp.ll
index 9444385..5a98626 100644
--- a/llvm/test/CodeGen/ARM64/ldp.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ldp.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=arm64 -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -march=arm64 -arm64-unscaled-mem-op=true\
+; RUN: llc < %s -march=arm64 -aarch64-unscaled-mem-op=true\
 ; RUN:   -verify-machineinstrs | FileCheck -check-prefix=LDUR_CHK %s
 
 ; CHECK: ldp_int
diff --git a/llvm/test/CodeGen/ARM64/ldur.ll b/llvm/test/CodeGen/AArch64/arm64-ldur.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/ldur.ll
rename to llvm/test/CodeGen/AArch64/arm64-ldur.ll
diff --git a/llvm/test/CodeGen/ARM64/ldxr-stxr.ll b/llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll
similarity index 69%
rename from llvm/test/CodeGen/ARM64/ldxr-stxr.ll
rename to llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll
index ed53a14..9093df2 100644
--- a/llvm/test/CodeGen/ARM64/ldxr-stxr.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ldxr-stxr.ll
@@ -6,7 +6,7 @@
 ; CHECK-LABEL: f0:
 ; CHECK: ldxp {{x[0-9]+}}, {{x[0-9]+}}, [x0]
 entry:
-  %ldrexd = tail call %0 @llvm.arm64.ldxp(i8* %p)
+  %ldrexd = tail call %0 @llvm.aarch64.ldxp(i8* %p)
   %0 = extractvalue %0 %ldrexd, 1
   %1 = extractvalue %0 %ldrexd, 0
   %2 = zext i64 %0 to i128
@@ -23,12 +23,12 @@
   %tmp4 = trunc i128 %val to i64
   %tmp6 = lshr i128 %val, 64
   %tmp7 = trunc i128 %tmp6 to i64
-  %strexd = tail call i32 @llvm.arm64.stxp(i64 %tmp4, i64 %tmp7, i8* %ptr)
+  %strexd = tail call i32 @llvm.aarch64.stxp(i64 %tmp4, i64 %tmp7, i8* %ptr)
   ret i32 %strexd
 }
 
-declare %0 @llvm.arm64.ldxp(i8*) nounwind
-declare i32 @llvm.arm64.stxp(i64, i64, i8*) nounwind
+declare %0 @llvm.aarch64.ldxp(i8*) nounwind
+declare i32 @llvm.aarch64.stxp(i64, i64, i8*) nounwind
 
 @var = global i64 0, align 8
 
@@ -39,7 +39,7 @@
 ; CHECK-NOT: and
 ; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
 
-  %val = call i64 @llvm.arm64.ldxr.p0i8(i8* %addr)
+  %val = call i64 @llvm.aarch64.ldxr.p0i8(i8* %addr)
   %shortval = trunc i64 %val to i8
   %extval = zext i8 %shortval to i64
   store i64 %extval, i64* @var, align 8
@@ -53,7 +53,7 @@
 ; CHECK-NOT: and
 ; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
 
-  %val = call i64 @llvm.arm64.ldxr.p0i16(i16* %addr)
+  %val = call i64 @llvm.aarch64.ldxr.p0i16(i16* %addr)
   %shortval = trunc i64 %val to i16
   %extval = zext i16 %shortval to i64
   store i64 %extval, i64* @var, align 8
@@ -67,7 +67,7 @@
 ; CHECK-NOT: and
 ; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
 
-  %val = call i64 @llvm.arm64.ldxr.p0i32(i32* %addr)
+  %val = call i64 @llvm.aarch64.ldxr.p0i32(i32* %addr)
   %shortval = trunc i64 %val to i32
   %extval = zext i32 %shortval to i64
   store i64 %extval, i64* @var, align 8
@@ -79,16 +79,16 @@
 ; CHECK: ldxr x[[LOADVAL:[0-9]+]], [x0]
 ; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
 
-  %val = call i64 @llvm.arm64.ldxr.p0i64(i64* %addr)
+  %val = call i64 @llvm.aarch64.ldxr.p0i64(i64* %addr)
   store i64 %val, i64* @var, align 8
   ret void
 }
 
 
-declare i64 @llvm.arm64.ldxr.p0i8(i8*) nounwind
-declare i64 @llvm.arm64.ldxr.p0i16(i16*) nounwind
-declare i64 @llvm.arm64.ldxr.p0i32(i32*) nounwind
-declare i64 @llvm.arm64.ldxr.p0i64(i64*) nounwind
+declare i64 @llvm.aarch64.ldxr.p0i8(i8*) nounwind
+declare i64 @llvm.aarch64.ldxr.p0i16(i16*) nounwind
+declare i64 @llvm.aarch64.ldxr.p0i32(i32*) nounwind
+declare i64 @llvm.aarch64.ldxr.p0i64(i64*) nounwind
 
 define i32 @test_store_i8(i32, i8 %val, i8* %addr) {
 ; CHECK-LABEL: test_store_i8:
@@ -96,7 +96,7 @@
 ; CHECK-NOT: and
 ; CHECK: stxrb w0, w1, [x2]
   %extval = zext i8 %val to i64
-  %res = call i32 @llvm.arm64.stxr.p0i8(i64 %extval, i8* %addr)
+  %res = call i32 @llvm.aarch64.stxr.p0i8(i64 %extval, i8* %addr)
   ret i32 %res
 }
 
@@ -106,7 +106,7 @@
 ; CHECK-NOT: and
 ; CHECK: stxrh w0, w1, [x2]
   %extval = zext i16 %val to i64
-  %res = call i32 @llvm.arm64.stxr.p0i16(i64 %extval, i16* %addr)
+  %res = call i32 @llvm.aarch64.stxr.p0i16(i64 %extval, i16* %addr)
   ret i32 %res
 }
 
@@ -116,36 +116,36 @@
 ; CHECK-NOT: and
 ; CHECK: stxr w0, w1, [x2]
   %extval = zext i32 %val to i64
-  %res = call i32 @llvm.arm64.stxr.p0i32(i64 %extval, i32* %addr)
+  %res = call i32 @llvm.aarch64.stxr.p0i32(i64 %extval, i32* %addr)
   ret i32 %res
 }
 
 define i32 @test_store_i64(i32, i64 %val, i64* %addr) {
 ; CHECK-LABEL: test_store_i64:
 ; CHECK: stxr w0, x1, [x2]
-  %res = call i32 @llvm.arm64.stxr.p0i64(i64 %val, i64* %addr)
+  %res = call i32 @llvm.aarch64.stxr.p0i64(i64 %val, i64* %addr)
   ret i32 %res
 }
 
-declare i32 @llvm.arm64.stxr.p0i8(i64, i8*) nounwind
-declare i32 @llvm.arm64.stxr.p0i16(i64, i16*) nounwind
-declare i32 @llvm.arm64.stxr.p0i32(i64, i32*) nounwind
-declare i32 @llvm.arm64.stxr.p0i64(i64, i64*) nounwind
+declare i32 @llvm.aarch64.stxr.p0i8(i64, i8*) nounwind
+declare i32 @llvm.aarch64.stxr.p0i16(i64, i16*) nounwind
+declare i32 @llvm.aarch64.stxr.p0i32(i64, i32*) nounwind
+declare i32 @llvm.aarch64.stxr.p0i64(i64, i64*) nounwind
 
 ; CHECK: test_clear:
 ; CHECK: clrex
 define void @test_clear() {
-  call void @llvm.arm64.clrex()
+  call void @llvm.aarch64.clrex()
   ret void
 }
 
-declare void @llvm.arm64.clrex() nounwind
+declare void @llvm.aarch64.clrex() nounwind
 
 define i128 @test_load_acquire_i128(i8* %p) nounwind readonly {
 ; CHECK-LABEL: test_load_acquire_i128:
 ; CHECK: ldaxp {{x[0-9]+}}, {{x[0-9]+}}, [x0]
 entry:
-  %ldrexd = tail call %0 @llvm.arm64.ldaxp(i8* %p)
+  %ldrexd = tail call %0 @llvm.aarch64.ldaxp(i8* %p)
   %0 = extractvalue %0 %ldrexd, 1
   %1 = extractvalue %0 %ldrexd, 0
   %2 = zext i64 %0 to i128
@@ -162,12 +162,12 @@
   %tmp4 = trunc i128 %val to i64
   %tmp6 = lshr i128 %val, 64
   %tmp7 = trunc i128 %tmp6 to i64
-  %strexd = tail call i32 @llvm.arm64.stlxp(i64 %tmp4, i64 %tmp7, i8* %ptr)
+  %strexd = tail call i32 @llvm.aarch64.stlxp(i64 %tmp4, i64 %tmp7, i8* %ptr)
   ret i32 %strexd
 }
 
-declare %0 @llvm.arm64.ldaxp(i8*) nounwind
-declare i32 @llvm.arm64.stlxp(i64, i64, i8*) nounwind
+declare %0 @llvm.aarch64.ldaxp(i8*) nounwind
+declare i32 @llvm.aarch64.stlxp(i64, i64, i8*) nounwind
 
 define void @test_load_acquire_i8(i8* %addr) {
 ; CHECK-LABEL: test_load_acquire_i8:
@@ -176,7 +176,7 @@
 ; CHECK-NOT: and
 ; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
 
-  %val = call i64 @llvm.arm64.ldaxr.p0i8(i8* %addr)
+  %val = call i64 @llvm.aarch64.ldaxr.p0i8(i8* %addr)
   %shortval = trunc i64 %val to i8
   %extval = zext i8 %shortval to i64
   store i64 %extval, i64* @var, align 8
@@ -190,7 +190,7 @@
 ; CHECK-NOT: and
 ; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
 
-  %val = call i64 @llvm.arm64.ldaxr.p0i16(i16* %addr)
+  %val = call i64 @llvm.aarch64.ldaxr.p0i16(i16* %addr)
   %shortval = trunc i64 %val to i16
   %extval = zext i16 %shortval to i64
   store i64 %extval, i64* @var, align 8
@@ -204,7 +204,7 @@
 ; CHECK-NOT: and
 ; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
 
-  %val = call i64 @llvm.arm64.ldaxr.p0i32(i32* %addr)
+  %val = call i64 @llvm.aarch64.ldaxr.p0i32(i32* %addr)
   %shortval = trunc i64 %val to i32
   %extval = zext i32 %shortval to i64
   store i64 %extval, i64* @var, align 8
@@ -216,16 +216,16 @@
 ; CHECK: ldaxr x[[LOADVAL:[0-9]+]], [x0]
 ; CHECK: str x[[LOADVAL]], [{{x[0-9]+}}, :lo12:var]
 
-  %val = call i64 @llvm.arm64.ldaxr.p0i64(i64* %addr)
+  %val = call i64 @llvm.aarch64.ldaxr.p0i64(i64* %addr)
   store i64 %val, i64* @var, align 8
   ret void
 }
 
 
-declare i64 @llvm.arm64.ldaxr.p0i8(i8*) nounwind
-declare i64 @llvm.arm64.ldaxr.p0i16(i16*) nounwind
-declare i64 @llvm.arm64.ldaxr.p0i32(i32*) nounwind
-declare i64 @llvm.arm64.ldaxr.p0i64(i64*) nounwind
+declare i64 @llvm.aarch64.ldaxr.p0i8(i8*) nounwind
+declare i64 @llvm.aarch64.ldaxr.p0i16(i16*) nounwind
+declare i64 @llvm.aarch64.ldaxr.p0i32(i32*) nounwind
+declare i64 @llvm.aarch64.ldaxr.p0i64(i64*) nounwind
 
 define i32 @test_store_release_i8(i32, i8 %val, i8* %addr) {
 ; CHECK-LABEL: test_store_release_i8:
@@ -233,7 +233,7 @@
 ; CHECK-NOT: and
 ; CHECK: stlxrb w0, w1, [x2]
   %extval = zext i8 %val to i64
-  %res = call i32 @llvm.arm64.stlxr.p0i8(i64 %extval, i8* %addr)
+  %res = call i32 @llvm.aarch64.stlxr.p0i8(i64 %extval, i8* %addr)
   ret i32 %res
 }
 
@@ -243,7 +243,7 @@
 ; CHECK-NOT: and
 ; CHECK: stlxrh w0, w1, [x2]
   %extval = zext i16 %val to i64
-  %res = call i32 @llvm.arm64.stlxr.p0i16(i64 %extval, i16* %addr)
+  %res = call i32 @llvm.aarch64.stlxr.p0i16(i64 %extval, i16* %addr)
   ret i32 %res
 }
 
@@ -253,18 +253,18 @@
 ; CHECK-NOT: and
 ; CHECK: stlxr w0, w1, [x2]
   %extval = zext i32 %val to i64
-  %res = call i32 @llvm.arm64.stlxr.p0i32(i64 %extval, i32* %addr)
+  %res = call i32 @llvm.aarch64.stlxr.p0i32(i64 %extval, i32* %addr)
   ret i32 %res
 }
 
 define i32 @test_store_release_i64(i32, i64 %val, i64* %addr) {
 ; CHECK-LABEL: test_store_release_i64:
 ; CHECK: stlxr w0, x1, [x2]
-  %res = call i32 @llvm.arm64.stlxr.p0i64(i64 %val, i64* %addr)
+  %res = call i32 @llvm.aarch64.stlxr.p0i64(i64 %val, i64* %addr)
   ret i32 %res
 }
 
-declare i32 @llvm.arm64.stlxr.p0i8(i64, i8*) nounwind
-declare i32 @llvm.arm64.stlxr.p0i16(i64, i16*) nounwind
-declare i32 @llvm.arm64.stlxr.p0i32(i64, i32*) nounwind
-declare i32 @llvm.arm64.stlxr.p0i64(i64, i64*) nounwind
+declare i32 @llvm.aarch64.stlxr.p0i8(i64, i8*) nounwind
+declare i32 @llvm.aarch64.stlxr.p0i16(i64, i16*) nounwind
+declare i32 @llvm.aarch64.stlxr.p0i32(i64, i32*) nounwind
+declare i32 @llvm.aarch64.stlxr.p0i64(i64, i64*) nounwind
diff --git a/llvm/test/CodeGen/ARM64/leaf.ll b/llvm/test/CodeGen/AArch64/arm64-leaf.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/leaf.ll
rename to llvm/test/CodeGen/AArch64/arm64-leaf.ll
diff --git a/llvm/test/CodeGen/ARM64/long-shift.ll b/llvm/test/CodeGen/AArch64/arm64-long-shift.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/long-shift.ll
rename to llvm/test/CodeGen/AArch64/arm64-long-shift.ll
diff --git a/llvm/test/CodeGen/ARM64/memcpy-inline.ll b/llvm/test/CodeGen/AArch64/arm64-memcpy-inline.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/memcpy-inline.ll
rename to llvm/test/CodeGen/AArch64/arm64-memcpy-inline.ll
diff --git a/llvm/test/CodeGen/ARM64/memset-inline.ll b/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/memset-inline.ll
rename to llvm/test/CodeGen/AArch64/arm64-memset-inline.ll
diff --git a/llvm/test/CodeGen/ARM64/memset-to-bzero.ll b/llvm/test/CodeGen/AArch64/arm64-memset-to-bzero.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/memset-to-bzero.ll
rename to llvm/test/CodeGen/AArch64/arm64-memset-to-bzero.ll
diff --git a/llvm/test/CodeGen/ARM64/misched-basic-A53.ll b/llvm/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
similarity index 96%
rename from llvm/test/CodeGen/ARM64/misched-basic-A53.ll
rename to llvm/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
index d69b097..f88bd6a 100644
--- a/llvm/test/CodeGen/ARM64/misched-basic-A53.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-misched-basic-A53.ll
@@ -115,10 +115,10 @@
 ;
 ; Nothing explicit to check other than llc not crashing.
 define { <16 x i8>, <16 x i8> } @test_v16i8_post_imm_ld2(i8* %A, i8** %ptr) {
-  %ld2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2.v16i8.p0i8(i8* %A)
+  %ld2 = tail call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0i8(i8* %A)
   %tmp = getelementptr i8* %A, i32 32
   store i8* %tmp, i8** %ptr
   ret { <16 x i8>, <16 x i8> } %ld2
 }
 
-declare { <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld2.v16i8.p0i8(i8*)
+declare { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0i8(i8*)
diff --git a/llvm/test/CodeGen/ARM64/misched-forwarding-A53.ll b/llvm/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/misched-forwarding-A53.ll
rename to llvm/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll
diff --git a/llvm/test/CodeGen/ARM64/movi.ll b/llvm/test/CodeGen/AArch64/arm64-movi.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/movi.ll
rename to llvm/test/CodeGen/AArch64/arm64-movi.ll
diff --git a/llvm/test/CodeGen/ARM64/mul.ll b/llvm/test/CodeGen/AArch64/arm64-mul.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/mul.ll
rename to llvm/test/CodeGen/AArch64/arm64-mul.ll
diff --git a/llvm/test/CodeGen/ARM64/named-reg-alloc.ll b/llvm/test/CodeGen/AArch64/arm64-named-reg-alloc.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/named-reg-alloc.ll
rename to llvm/test/CodeGen/AArch64/arm64-named-reg-alloc.ll
diff --git a/llvm/test/CodeGen/ARM64/named-reg-notareg.ll b/llvm/test/CodeGen/AArch64/arm64-named-reg-notareg.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/named-reg-notareg.ll
rename to llvm/test/CodeGen/AArch64/arm64-named-reg-notareg.ll
diff --git a/llvm/test/CodeGen/ARM64/neg.ll b/llvm/test/CodeGen/AArch64/arm64-neg.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/neg.ll
rename to llvm/test/CodeGen/AArch64/arm64-neg.ll
diff --git a/llvm/test/CodeGen/ARM64/aarch64-neon-2velem-high.ll b/llvm/test/CodeGen/AArch64/arm64-neon-2velem-high.ll
similarity index 82%
rename from llvm/test/CodeGen/ARM64/aarch64-neon-2velem-high.ll
rename to llvm/test/CodeGen/AArch64/arm64-neon-2velem-high.ll
index 2013747..58df094 100644
--- a/llvm/test/CodeGen/ARM64/aarch64-neon-2velem-high.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-2velem-high.ll
@@ -4,25 +4,25 @@
 
 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
 
-declare <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
 
-declare <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
 
-declare <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
 
-declare <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
 
-declare <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
 
-declare <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
 
-declare <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32>, <2 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>)
 
-declare <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
 
-declare <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32>, <2 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)
 
-declare <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
 
 define <4 x i32> @test_vmull_high_n_s16(<8 x i16> %a, i16 %b) {
 ; CHECK-LABEL: test_vmull_high_n_s16:
@@ -34,7 +34,7 @@
   %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1
   %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2
   %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3
-  %vmull15.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+  %vmull15.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
   ret <4 x i32> %vmull15.i.i
 }
 
@@ -46,7 +46,7 @@
   %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0
   %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1
-  %vmull9.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+  %vmull9.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
   ret <2 x i64> %vmull9.i.i
 }
 
@@ -60,7 +60,7 @@
   %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1
   %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2
   %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3
-  %vmull15.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+  %vmull15.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
   ret <4 x i32> %vmull15.i.i
 }
 
@@ -72,7 +72,7 @@
   %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0
   %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1
-  %vmull9.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+  %vmull9.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
   ret <2 x i64> %vmull9.i.i
 }
 
@@ -86,7 +86,7 @@
   %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1
   %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2
   %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3
-  %vqdmull15.i.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+  %vqdmull15.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
   ret <4 x i32> %vqdmull15.i.i
 }
 
@@ -98,7 +98,7 @@
   %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0
   %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1
-  %vqdmull9.i.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+  %vqdmull9.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
   ret <2 x i64> %vqdmull9.i.i
 }
 
@@ -112,7 +112,7 @@
   %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
   %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
   %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
-  %vmull2.i.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+  %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
   %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
   ret <4 x i32> %add.i.i
 }
@@ -125,7 +125,7 @@
   %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
   %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
-  %vmull2.i.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+  %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
   %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
   ret <2 x i64> %add.i.i
 }
@@ -140,7 +140,7 @@
   %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
   %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
   %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
-  %vmull2.i.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+  %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
   %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
   ret <4 x i32> %add.i.i
 }
@@ -153,7 +153,7 @@
   %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
   %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
-  %vmull2.i.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+  %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
   %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
   ret <2 x i64> %add.i.i
 }
@@ -167,8 +167,8 @@
   %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
   %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
   %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
-  %vqdmlal15.i.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
-  %vqdmlal17.i.i = tail call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal15.i.i)
+  %vqdmlal15.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+  %vqdmlal17.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal15.i.i)
   ret <4 x i32> %vqdmlal17.i.i
 }
 
@@ -179,8 +179,8 @@
   %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
   %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
-  %vqdmlal9.i.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
-  %vqdmlal11.i.i = tail call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal9.i.i)
+  %vqdmlal9.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+  %vqdmlal11.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal9.i.i)
   ret <2 x i64> %vqdmlal11.i.i
 }
 
@@ -193,7 +193,7 @@
   %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
   %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
   %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
-  %vmull2.i.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+  %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
   %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
   ret <4 x i32> %sub.i.i
 }
@@ -205,7 +205,7 @@
   %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
   %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
-  %vmull2.i.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+  %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
   %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
   ret <2 x i64> %sub.i.i
 }
@@ -219,7 +219,7 @@
   %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
   %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
   %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
-  %vmull2.i.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+  %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
   %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
   ret <4 x i32> %sub.i.i
 }
@@ -231,7 +231,7 @@
   %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
   %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
-  %vmull2.i.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+  %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
   %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
   ret <2 x i64> %sub.i.i
 }
@@ -245,8 +245,8 @@
   %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
   %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
   %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
-  %vqdmlsl15.i.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
-  %vqdmlsl17.i.i = tail call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl15.i.i)
+  %vqdmlsl15.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+  %vqdmlsl17.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl15.i.i)
   ret <4 x i32> %vqdmlsl17.i.i
 }
 
@@ -257,8 +257,8 @@
   %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
   %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
-  %vqdmlsl9.i.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
-  %vqdmlsl11.i.i = tail call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl9.i.i)
+  %vqdmlsl9.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+  %vqdmlsl11.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl9.i.i)
   ret <2 x i64> %vqdmlsl11.i.i
 }
 
diff --git a/llvm/test/CodeGen/ARM64/aarch64-neon-2velem.ll b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
similarity index 83%
rename from llvm/test/CodeGen/ARM64/aarch64-neon-2velem.ll
rename to llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
index 4c6b72d..869966c 100644
--- a/llvm/test/CodeGen/ARM64/aarch64-neon-2velem.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -1,46 +1,46 @@
 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
 
-declare <2 x double> @llvm.arm64.neon.fmulx.v2f64(<2 x double>, <2 x double>)
+declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>)
 
-declare <4 x float> @llvm.arm64.neon.fmulx.v4f32(<4 x float>, <4 x float>)
+declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>)
 
-declare <2 x float> @llvm.arm64.neon.fmulx.v2f32(<2 x float>, <2 x float>)
+declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>)
 
-declare <4 x i32> @llvm.arm64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
 
-declare <2 x i32> @llvm.arm64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
 
-declare <8 x i16> @llvm.arm64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
 
-declare <4 x i16> @llvm.arm64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
 
-declare <4 x i32> @llvm.arm64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>)
 
-declare <2 x i32> @llvm.arm64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>)
 
-declare <8 x i16> @llvm.arm64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>)
 
-declare <4 x i16> @llvm.arm64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>)
 
-declare <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
 
-declare <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
 
-declare <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
 
-declare <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
 
-declare <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
 
-declare <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
 
-declare <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32>, <2 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>)
 
-declare <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
 
-declare <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32>, <2 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)
 
-declare <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
 
 define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) {
 ; CHECK-LABEL: test_vmla_lane_s16:
@@ -563,7 +563,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   %add = add <4 x i32> %vmull2.i, %a
   ret <4 x i32> %add
 }
@@ -574,7 +574,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   %add = add <2 x i64> %vmull2.i, %a
   ret <2 x i64> %add
 }
@@ -585,7 +585,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   %add = add <4 x i32> %vmull2.i, %a
   ret <4 x i32> %add
 }
@@ -596,7 +596,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   %add = add <2 x i64> %vmull2.i, %a
   ret <2 x i64> %add
 }
@@ -608,7 +608,7 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   %add = add <4 x i32> %vmull2.i, %a
   ret <4 x i32> %add
 }
@@ -620,7 +620,7 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   %add = add <2 x i64> %vmull2.i, %a
   ret <2 x i64> %add
 }
@@ -632,7 +632,7 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   %add = add <4 x i32> %vmull2.i, %a
   ret <4 x i32> %add
 }
@@ -644,7 +644,7 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   %add = add <2 x i64> %vmull2.i, %a
   ret <2 x i64> %add
 }
@@ -655,7 +655,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   %sub = sub <4 x i32> %a, %vmull2.i
   ret <4 x i32> %sub
 }
@@ -666,7 +666,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   %sub = sub <2 x i64> %a, %vmull2.i
   ret <2 x i64> %sub
 }
@@ -677,7 +677,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   %sub = sub <4 x i32> %a, %vmull2.i
   ret <4 x i32> %sub
 }
@@ -688,7 +688,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   %sub = sub <2 x i64> %a, %vmull2.i
   ret <2 x i64> %sub
 }
@@ -700,7 +700,7 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   %sub = sub <4 x i32> %a, %vmull2.i
   ret <4 x i32> %sub
 }
@@ -712,7 +712,7 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   %sub = sub <2 x i64> %a, %vmull2.i
   ret <2 x i64> %sub
 }
@@ -724,7 +724,7 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   %sub = sub <4 x i32> %a, %vmull2.i
   ret <4 x i32> %sub
 }
@@ -736,7 +736,7 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   %sub = sub <2 x i64> %a, %vmull2.i
   ret <2 x i64> %sub
 }
@@ -747,7 +747,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   %add = add <4 x i32> %vmull2.i, %a
   ret <4 x i32> %add
 }
@@ -758,7 +758,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   %add = add <2 x i64> %vmull2.i, %a
   ret <2 x i64> %add
 }
@@ -769,7 +769,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   %add = add <4 x i32> %vmull2.i, %a
   ret <4 x i32> %add
 }
@@ -780,7 +780,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   %add = add <2 x i64> %vmull2.i, %a
   ret <2 x i64> %add
 }
@@ -792,7 +792,7 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   %add = add <4 x i32> %vmull2.i, %a
   ret <4 x i32> %add
 }
@@ -804,7 +804,7 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   %add = add <2 x i64> %vmull2.i, %a
   ret <2 x i64> %add
 }
@@ -816,7 +816,7 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   %add = add <4 x i32> %vmull2.i, %a
   ret <4 x i32> %add
 }
@@ -828,7 +828,7 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   %add = add <2 x i64> %vmull2.i, %a
   ret <2 x i64> %add
 }
@@ -839,7 +839,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   %sub = sub <4 x i32> %a, %vmull2.i
   ret <4 x i32> %sub
 }
@@ -850,7 +850,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   %sub = sub <2 x i64> %a, %vmull2.i
   ret <2 x i64> %sub
 }
@@ -861,7 +861,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   %sub = sub <4 x i32> %a, %vmull2.i
   ret <4 x i32> %sub
 }
@@ -872,7 +872,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   %sub = sub <2 x i64> %a, %vmull2.i
   ret <2 x i64> %sub
 }
@@ -884,7 +884,7 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   %sub = sub <4 x i32> %a, %vmull2.i
   ret <4 x i32> %sub
 }
@@ -896,7 +896,7 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   %sub = sub <2 x i64> %a, %vmull2.i
   ret <2 x i64> %sub
 }
@@ -908,7 +908,7 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   %sub = sub <4 x i32> %a, %vmull2.i
   ret <4 x i32> %sub
 }
@@ -920,7 +920,7 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   %sub = sub <2 x i64> %a, %vmull2.i
   ret <2 x i64> %sub
 }
@@ -931,7 +931,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   ret <4 x i32> %vmull2.i
 }
 
@@ -941,7 +941,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   ret <2 x i64> %vmull2.i
 }
 
@@ -951,7 +951,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   ret <4 x i32> %vmull2.i
 }
 
@@ -961,7 +961,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   ret <2 x i64> %vmull2.i
 }
 
@@ -972,7 +972,7 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   ret <4 x i32> %vmull2.i
 }
 
@@ -983,7 +983,7 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   ret <2 x i64> %vmull2.i
 }
 
@@ -994,7 +994,7 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   ret <4 x i32> %vmull2.i
 }
 
@@ -1005,7 +1005,7 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   ret <2 x i64> %vmull2.i
 }
 
@@ -1015,7 +1015,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   ret <4 x i32> %vmull2.i
 }
 
@@ -1025,7 +1025,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   ret <2 x i64> %vmull2.i
 }
 
@@ -1035,7 +1035,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   ret <4 x i32> %vmull2.i
 }
 
@@ -1045,7 +1045,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   ret <2 x i64> %vmull2.i
 }
 
@@ -1056,7 +1056,7 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   ret <4 x i32> %vmull2.i
 }
 
@@ -1067,7 +1067,7 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   ret <2 x i64> %vmull2.i
 }
 
@@ -1078,7 +1078,7 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   ret <4 x i32> %vmull2.i
 }
 
@@ -1089,7 +1089,7 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   ret <2 x i64> %vmull2.i
 }
 
@@ -1099,8 +1099,8 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  %vqdmlal2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
-  %vqdmlal4.i = tail call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
+  %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
   ret <4 x i32> %vqdmlal4.i
 }
 
@@ -1110,8 +1110,8 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-  %vqdmlal2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
-  %vqdmlal4.i = tail call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
+  %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
   ret <2 x i64> %vqdmlal4.i
 }
 
@@ -1122,8 +1122,8 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  %vqdmlal2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
-  %vqdmlal4.i = tail call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
+  %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
   ret <4 x i32> %vqdmlal4.i
 }
 
@@ -1134,8 +1134,8 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-  %vqdmlal2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
-  %vqdmlal4.i = tail call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
+  %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
   ret <2 x i64> %vqdmlal4.i
 }
 
@@ -1145,8 +1145,8 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  %vqdmlsl2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
-  %vqdmlsl4.i = tail call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
+  %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
   ret <4 x i32> %vqdmlsl4.i
 }
 
@@ -1156,8 +1156,8 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-  %vqdmlsl2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
-  %vqdmlsl4.i = tail call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
+  %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
   ret <2 x i64> %vqdmlsl4.i
 }
 
@@ -1168,8 +1168,8 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  %vqdmlsl2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
-  %vqdmlsl4.i = tail call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
+  %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
   ret <4 x i32> %vqdmlsl4.i
 }
 
@@ -1180,8 +1180,8 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-  %vqdmlsl2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
-  %vqdmlsl4.i = tail call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
+  %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
   ret <2 x i64> %vqdmlsl4.i
 }
 
@@ -1191,7 +1191,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  %vqdmull2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+  %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   ret <4 x i32> %vqdmull2.i
 }
 
@@ -1201,7 +1201,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-  %vqdmull2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+  %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   ret <2 x i64> %vqdmull2.i
 }
 
@@ -1211,7 +1211,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  %vqdmull2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+  %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   ret <4 x i32> %vqdmull2.i
 }
 
@@ -1221,7 +1221,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
-  %vqdmull2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+  %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   ret <2 x i64> %vqdmull2.i
 }
 
@@ -1232,7 +1232,7 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  %vqdmull2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   ret <4 x i32> %vqdmull2.i
 }
 
@@ -1243,7 +1243,7 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-  %vqdmull2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   ret <2 x i64> %vqdmull2.i
 }
 
@@ -1254,7 +1254,7 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-  %vqdmull2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   ret <4 x i32> %vqdmull2.i
 }
 
@@ -1265,7 +1265,7 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> <i32 3, i32 3>
-  %vqdmull2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   ret <2 x i64> %vqdmull2.i
 }
 
@@ -1275,7 +1275,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  %vqdmulh2.i = tail call <4 x i16> @llvm.arm64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
+  %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
   ret <4 x i16> %vqdmulh2.i
 }
 
@@ -1285,7 +1285,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-  %vqdmulh2.i = tail call <8 x i16> @llvm.arm64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
+  %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
   ret <8 x i16> %vqdmulh2.i
 }
 
@@ -1295,7 +1295,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-  %vqdmulh2.i = tail call <2 x i32> @llvm.arm64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
+  %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
   ret <2 x i32> %vqdmulh2.i
 }
 
@@ -1305,7 +1305,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  %vqdmulh2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
+  %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
   ret <4 x i32> %vqdmulh2.i
 }
 
@@ -1315,7 +1315,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  %vqrdmulh2.i = tail call <4 x i16> @llvm.arm64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
+  %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
   ret <4 x i16> %vqrdmulh2.i
 }
 
@@ -1325,7 +1325,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-  %vqrdmulh2.i = tail call <8 x i16> @llvm.arm64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
+  %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
   ret <8 x i16> %vqrdmulh2.i
 }
 
@@ -1335,7 +1335,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-  %vqrdmulh2.i = tail call <2 x i32> @llvm.arm64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
+  %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
   ret <2 x i32> %vqrdmulh2.i
 }
 
@@ -1345,7 +1345,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  %vqrdmulh2.i = tail call <4 x i32> @llvm.arm64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
+  %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
   ret <4 x i32> %vqrdmulh2.i
 }
 
@@ -1441,7 +1441,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
-  %vmulx2.i = tail call <2 x float> @llvm.arm64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
+  %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
   ret <2 x float> %vmulx2.i
 }
 
@@ -1451,7 +1451,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  %vmulx2.i = tail call <4 x float> @llvm.arm64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
+  %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
   ret <4 x float> %vmulx2.i
 }
 
@@ -1461,7 +1461,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
-  %vmulx2.i = tail call <2 x double> @llvm.arm64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
+  %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
   ret <2 x double> %vmulx2.i
 }
 
@@ -1471,7 +1471,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
-  %vmulx2.i = tail call <2 x float> @llvm.arm64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
+  %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
   ret <2 x float> %vmulx2.i
 }
 
@@ -1481,7 +1481,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-  %vmulx2.i = tail call <4 x float> @llvm.arm64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
+  %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
   ret <4 x float> %vmulx2.i
 }
 
@@ -1491,7 +1491,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
-  %vmulx2.i = tail call <2 x double> @llvm.arm64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
+  %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
   ret <2 x double> %vmulx2.i
 }
 
@@ -1942,7 +1942,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   %add = add <4 x i32> %vmull2.i, %a
   ret <4 x i32> %add
 }
@@ -1953,7 +1953,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   %add = add <2 x i64> %vmull2.i, %a
   ret <2 x i64> %add
 }
@@ -1964,7 +1964,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   %add = add <4 x i32> %vmull2.i, %a
   ret <4 x i32> %add
 }
@@ -1975,7 +1975,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   %add = add <2 x i64> %vmull2.i, %a
   ret <2 x i64> %add
 }
@@ -1987,7 +1987,7 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   %add = add <4 x i32> %vmull2.i, %a
   ret <4 x i32> %add
 }
@@ -1999,7 +1999,7 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   %add = add <2 x i64> %vmull2.i, %a
   ret <2 x i64> %add
 }
@@ -2011,7 +2011,7 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   %add = add <4 x i32> %vmull2.i, %a
   ret <4 x i32> %add
 }
@@ -2023,7 +2023,7 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   %add = add <2 x i64> %vmull2.i, %a
   ret <2 x i64> %add
 }
@@ -2034,7 +2034,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   %sub = sub <4 x i32> %a, %vmull2.i
   ret <4 x i32> %sub
 }
@@ -2045,7 +2045,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   %sub = sub <2 x i64> %a, %vmull2.i
   ret <2 x i64> %sub
 }
@@ -2056,7 +2056,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   %sub = sub <4 x i32> %a, %vmull2.i
   ret <4 x i32> %sub
 }
@@ -2067,7 +2067,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   %sub = sub <2 x i64> %a, %vmull2.i
   ret <2 x i64> %sub
 }
@@ -2079,7 +2079,7 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   %sub = sub <4 x i32> %a, %vmull2.i
   ret <4 x i32> %sub
 }
@@ -2091,7 +2091,7 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   %sub = sub <2 x i64> %a, %vmull2.i
   ret <2 x i64> %sub
 }
@@ -2103,7 +2103,7 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   %sub = sub <4 x i32> %a, %vmull2.i
   ret <4 x i32> %sub
 }
@@ -2115,7 +2115,7 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   %sub = sub <2 x i64> %a, %vmull2.i
   ret <2 x i64> %sub
 }
@@ -2126,7 +2126,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   %add = add <4 x i32> %vmull2.i, %a
   ret <4 x i32> %add
 }
@@ -2137,7 +2137,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   %add = add <2 x i64> %vmull2.i, %a
   ret <2 x i64> %add
 }
@@ -2148,7 +2148,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   %add = add <4 x i32> %vmull2.i, %a
   ret <4 x i32> %add
 }
@@ -2159,7 +2159,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   %add = add <2 x i64> %vmull2.i, %a
   ret <2 x i64> %add
 }
@@ -2171,7 +2171,7 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   %add = add <4 x i32> %vmull2.i, %a
   ret <4 x i32> %add
 }
@@ -2183,7 +2183,7 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   %add = add <2 x i64> %vmull2.i, %a
   ret <2 x i64> %add
 }
@@ -2195,7 +2195,7 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   %add = add <4 x i32> %vmull2.i, %a
   ret <4 x i32> %add
 }
@@ -2207,7 +2207,7 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   %add = add <2 x i64> %vmull2.i, %a
   ret <2 x i64> %add
 }
@@ -2218,7 +2218,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   %sub = sub <4 x i32> %a, %vmull2.i
   ret <4 x i32> %sub
 }
@@ -2229,7 +2229,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   %sub = sub <2 x i64> %a, %vmull2.i
   ret <2 x i64> %sub
 }
@@ -2240,7 +2240,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
   %sub = sub <4 x i32> %a, %vmull2.i
   ret <4 x i32> %sub
 }
@@ -2251,7 +2251,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
   %sub = sub <2 x i64> %a, %vmull2.i
   ret <2 x i64> %sub
 }
@@ -2263,7 +2263,7 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   %sub = sub <4 x i32> %a, %vmull2.i
   ret <4 x i32> %sub
 }
@@ -2275,7 +2275,7 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   %sub = sub <2 x i64> %a, %vmull2.i
   ret <2 x i64> %sub
 }
@@ -2287,7 +2287,7 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   %sub = sub <4 x i32> %a, %vmull2.i
   ret <4 x i32> %sub
 }
@@ -2299,7 +2299,7 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   %sub = sub <2 x i64> %a, %vmull2.i
   ret <2 x i64> %sub
 }
@@ -2310,7 +2310,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   ret <4 x i32> %vmull2.i
 }
 
@@ -2320,7 +2320,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   ret <2 x i64> %vmull2.i
 }
 
@@ -2330,7 +2330,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   ret <4 x i32> %vmull2.i
 }
 
@@ -2340,7 +2340,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   ret <2 x i64> %vmull2.i
 }
 
@@ -2351,7 +2351,7 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   ret <4 x i32> %vmull2.i
 }
 
@@ -2362,7 +2362,7 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   ret <2 x i64> %vmull2.i
 }
 
@@ -2373,7 +2373,7 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   ret <4 x i32> %vmull2.i
 }
 
@@ -2384,7 +2384,7 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   ret <2 x i64> %vmull2.i
 }
 
@@ -2394,7 +2394,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   ret <4 x i32> %vmull2.i
 }
 
@@ -2404,7 +2404,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   ret <2 x i64> %vmull2.i
 }
 
@@ -2414,7 +2414,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   ret <4 x i32> %vmull2.i
 }
 
@@ -2424,7 +2424,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   ret <2 x i64> %vmull2.i
 }
 
@@ -2435,7 +2435,7 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   ret <4 x i32> %vmull2.i
 }
 
@@ -2446,7 +2446,7 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   ret <2 x i64> %vmull2.i
 }
 
@@ -2457,7 +2457,7 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   ret <4 x i32> %vmull2.i
 }
 
@@ -2468,7 +2468,7 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   ret <2 x i64> %vmull2.i
 }
 
@@ -2478,8 +2478,8 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
-  %vqdmlal2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
-  %vqdmlal4.i = tail call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
+  %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
   ret <4 x i32> %vqdmlal4.i
 }
 
@@ -2489,8 +2489,8 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
-  %vqdmlal2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
-  %vqdmlal4.i = tail call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
+  %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
   ret <2 x i64> %vqdmlal4.i
 }
 
@@ -2501,8 +2501,8 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
-  %vqdmlal2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
-  %vqdmlal4.i = tail call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
+  %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
   ret <4 x i32> %vqdmlal4.i
 }
 
@@ -2513,8 +2513,8 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
-  %vqdmlal2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
-  %vqdmlal4.i = tail call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
+  %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
   ret <2 x i64> %vqdmlal4.i
 }
 
@@ -2524,8 +2524,8 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
-  %vqdmlsl2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
-  %vqdmlsl4.i = tail call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
+  %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %shuffle)
+  %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
   ret <4 x i32> %vqdmlsl4.i
 }
 
@@ -2535,8 +2535,8 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
-  %vqdmlsl2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
-  %vqdmlsl4.i = tail call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
+  %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %shuffle)
+  %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
   ret <2 x i64> %vqdmlsl4.i
 }
 
@@ -2547,8 +2547,8 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
-  %vqdmlsl2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
-  %vqdmlsl4.i = tail call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
+  %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
   ret <4 x i32> %vqdmlsl4.i
 }
 
@@ -2559,8 +2559,8 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
-  %vqdmlsl2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
-  %vqdmlsl4.i = tail call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
+  %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
   ret <2 x i64> %vqdmlsl4.i
 }
 
@@ -2570,7 +2570,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
-  %vqdmull2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+  %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   ret <4 x i32> %vqdmull2.i
 }
 
@@ -2580,7 +2580,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
-  %vqdmull2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+  %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   ret <2 x i64> %vqdmull2.i
 }
 
@@ -2590,7 +2590,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
-  %vqdmull2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
+  %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle)
   ret <4 x i32> %vqdmull2.i
 }
 
@@ -2600,7 +2600,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
-  %vqdmull2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
+  %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle)
   ret <2 x i64> %vqdmull2.i
 }
 
@@ -2611,7 +2611,7 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
-  %vqdmull2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   ret <4 x i32> %vqdmull2.i
 }
 
@@ -2622,7 +2622,7 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
-  %vqdmull2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   ret <2 x i64> %vqdmull2.i
 }
 
@@ -2633,7 +2633,7 @@
 entry:
   %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer
-  %vqdmull2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
+  %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i, <4 x i16> %shuffle)
   ret <4 x i32> %vqdmull2.i
 }
 
@@ -2644,7 +2644,7 @@
 entry:
   %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer
-  %vqdmull2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
+  %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i, <2 x i32> %shuffle)
   ret <2 x i64> %vqdmull2.i
 }
 
@@ -2654,7 +2654,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
-  %vqdmulh2.i = tail call <4 x i16> @llvm.arm64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
+  %vqdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
   ret <4 x i16> %vqdmulh2.i
 }
 
@@ -2664,7 +2664,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
-  %vqdmulh2.i = tail call <8 x i16> @llvm.arm64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
+  %vqdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
   ret <8 x i16> %vqdmulh2.i
 }
 
@@ -2674,7 +2674,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
-  %vqdmulh2.i = tail call <2 x i32> @llvm.arm64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
+  %vqdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
   ret <2 x i32> %vqdmulh2.i
 }
 
@@ -2684,7 +2684,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
-  %vqdmulh2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
+  %vqdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
   ret <4 x i32> %vqdmulh2.i
 }
 
@@ -2694,7 +2694,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
-  %vqrdmulh2.i = tail call <4 x i16> @llvm.arm64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
+  %vqrdmulh2.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %a, <4 x i16> %shuffle)
   ret <4 x i16> %vqrdmulh2.i
 }
 
@@ -2704,7 +2704,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
-  %vqrdmulh2.i = tail call <8 x i16> @llvm.arm64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
+  %vqrdmulh2.i = tail call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %a, <8 x i16> %shuffle)
   ret <8 x i16> %vqrdmulh2.i
 }
 
@@ -2714,7 +2714,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
-  %vqrdmulh2.i = tail call <2 x i32> @llvm.arm64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
+  %vqrdmulh2.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %a, <2 x i32> %shuffle)
   ret <2 x i32> %vqrdmulh2.i
 }
 
@@ -2724,7 +2724,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer
-  %vqrdmulh2.i = tail call <4 x i32> @llvm.arm64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
+  %vqrdmulh2.i = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %a, <4 x i32> %shuffle)
   ret <4 x i32> %vqrdmulh2.i
 }
 
@@ -2797,7 +2797,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> zeroinitializer
-  %vmulx2.i = tail call <2 x float> @llvm.arm64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
+  %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
   ret <2 x float> %vmulx2.i
 }
 
@@ -2807,7 +2807,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x float> %v, <2 x float> undef, <4 x i32> zeroinitializer
-  %vmulx2.i = tail call <4 x float> @llvm.arm64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
+  %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
   ret <4 x float> %vmulx2.i
 }
 
@@ -2817,7 +2817,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <1 x double> %v, <1 x double> undef, <2 x i32> zeroinitializer
-  %vmulx2.i = tail call <2 x double> @llvm.arm64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
+  %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
   ret <2 x double> %vmulx2.i
 }
 
@@ -2827,7 +2827,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer
-  %vmulx2.i = tail call <2 x float> @llvm.arm64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
+  %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle)
   ret <2 x float> %vmulx2.i
 }
 
@@ -2837,7 +2837,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> zeroinitializer
-  %vmulx2.i = tail call <4 x float> @llvm.arm64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
+  %vmulx2.i = tail call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %a, <4 x float> %shuffle)
   ret <4 x float> %vmulx2.i
 }
 
@@ -2847,7 +2847,7 @@
 ; CHECK-NEXT: ret
 entry:
   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> zeroinitializer
-  %vmulx2.i = tail call <2 x double> @llvm.arm64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
+  %vmulx2.i = tail call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %a, <2 x double> %shuffle)
   ret <2 x double> %vmulx2.i
 }
 
diff --git a/llvm/test/CodeGen/ARM64/aarch64-neon-3vdiff.ll b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
similarity index 85%
rename from llvm/test/CodeGen/ARM64/aarch64-neon-3vdiff.ll
rename to llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
index a479844..cb9b36c 100644
--- a/llvm/test/CodeGen/ARM64/aarch64-neon-3vdiff.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-3vdiff.ll
@@ -1,54 +1,54 @@
 ; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
 
-declare <8 x i16> @llvm.arm64.neon.pmull.v8i16(<8 x i8>, <8 x i8>)
+declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>)
 
-declare <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>)
 
-declare <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
 
-declare <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
 
-declare <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
 
-declare <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
 
-declare <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
 
-declare <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32>, <2 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>)
 
-declare <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
 
-declare <8 x i16> @llvm.arm64.neon.umull.v8i16(<8 x i8>, <8 x i8>)
+declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>)
 
-declare <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32>, <2 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)
 
-declare <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
 
-declare <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8>, <8 x i8>)
+declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>)
 
-declare <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>)
 
-declare <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>)
 
-declare <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>)
 
-declare <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>)
 
-declare <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>)
 
-declare <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>)
 
-declare <2 x i32> @llvm.arm64.neon.rsubhn.v2i32(<2 x i64>, <2 x i64>)
+declare <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64>, <2 x i64>)
 
-declare <4 x i16> @llvm.arm64.neon.rsubhn.v4i16(<4 x i32>, <4 x i32>)
+declare <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32>, <4 x i32>)
 
-declare <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16>, <8 x i16>)
+declare <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16>, <8 x i16>)
 
-declare <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64>, <2 x i64>)
+declare <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64>, <2 x i64>)
 
-declare <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>)
+declare <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>)
 
-declare <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>)
+declare <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>)
 
 define <8 x i16> @test_vaddl_s8(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-LABEL: test_vaddl_s8:
@@ -690,7 +690,7 @@
 ; CHECK-LABEL: test_vraddhn_s16:
 ; CHECK: raddhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 entry:
-  %vraddhn2.i = tail call <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+  %vraddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
   ret <8 x i8> %vraddhn2.i
 }
 
@@ -698,7 +698,7 @@
 ; CHECK-LABEL: test_vraddhn_s32:
 ; CHECK: raddhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 entry:
-  %vraddhn2.i = tail call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+  %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
   ret <4 x i16> %vraddhn2.i
 }
 
@@ -706,7 +706,7 @@
 ; CHECK-LABEL: test_vraddhn_s64:
 ; CHECK: raddhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 entry:
-  %vraddhn2.i = tail call <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+  %vraddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
   ret <2 x i32> %vraddhn2.i
 }
 
@@ -714,7 +714,7 @@
 ; CHECK-LABEL: test_vraddhn_u16:
 ; CHECK: raddhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 entry:
-  %vraddhn2.i = tail call <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+  %vraddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
   ret <8 x i8> %vraddhn2.i
 }
 
@@ -722,7 +722,7 @@
 ; CHECK-LABEL: test_vraddhn_u32:
 ; CHECK: raddhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 entry:
-  %vraddhn2.i = tail call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+  %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
   ret <4 x i16> %vraddhn2.i
 }
 
@@ -730,7 +730,7 @@
 ; CHECK-LABEL: test_vraddhn_u64:
 ; CHECK: raddhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 entry:
-  %vraddhn2.i = tail call <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+  %vraddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
   ret <2 x i32> %vraddhn2.i
 }
 
@@ -738,7 +738,7 @@
 ; CHECK-LABEL: test_vraddhn_high_s16:
 ; CHECK: raddhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 entry:
-  %vraddhn2.i.i = tail call <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+  %vraddhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
   %0 = bitcast <8 x i8> %r to <1 x i64>
   %1 = bitcast <8 x i8> %vraddhn2.i.i to <1 x i64>
   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -750,7 +750,7 @@
 ; CHECK-LABEL: test_vraddhn_high_s32:
 ; CHECK: raddhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 entry:
-  %vraddhn2.i.i = tail call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+  %vraddhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
   %0 = bitcast <4 x i16> %r to <1 x i64>
   %1 = bitcast <4 x i16> %vraddhn2.i.i to <1 x i64>
   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -762,7 +762,7 @@
 ; CHECK-LABEL: test_vraddhn_high_s64:
 ; CHECK: raddhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 entry:
-  %vraddhn2.i.i = tail call <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+  %vraddhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
   %0 = bitcast <2 x i32> %r to <1 x i64>
   %1 = bitcast <2 x i32> %vraddhn2.i.i to <1 x i64>
   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -774,7 +774,7 @@
 ; CHECK-LABEL: test_vraddhn_high_u16:
 ; CHECK: raddhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 entry:
-  %vraddhn2.i.i = tail call <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+  %vraddhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b)
   %0 = bitcast <8 x i8> %r to <1 x i64>
   %1 = bitcast <8 x i8> %vraddhn2.i.i to <1 x i64>
   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -786,7 +786,7 @@
 ; CHECK-LABEL: test_vraddhn_high_u32:
 ; CHECK: raddhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 entry:
-  %vraddhn2.i.i = tail call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+  %vraddhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b)
   %0 = bitcast <4 x i16> %r to <1 x i64>
   %1 = bitcast <4 x i16> %vraddhn2.i.i to <1 x i64>
   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -798,7 +798,7 @@
 ; CHECK-LABEL: test_vraddhn_high_u64:
 ; CHECK: raddhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 entry:
-  %vraddhn2.i.i = tail call <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+  %vraddhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b)
   %0 = bitcast <2 x i32> %r to <1 x i64>
   %1 = bitcast <2 x i32> %vraddhn2.i.i to <1 x i64>
   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -954,7 +954,7 @@
 ; CHECK-LABEL: test_vrsubhn_s16:
 ; CHECK: rsubhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 entry:
-  %vrsubhn2.i = tail call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+  %vrsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
   ret <8 x i8> %vrsubhn2.i
 }
 
@@ -962,7 +962,7 @@
 ; CHECK-LABEL: test_vrsubhn_s32:
 ; CHECK: rsubhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 entry:
-  %vrsubhn2.i = tail call <4 x i16> @llvm.arm64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+  %vrsubhn2.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
   ret <4 x i16> %vrsubhn2.i
 }
 
@@ -970,7 +970,7 @@
 ; CHECK-LABEL: test_vrsubhn_s64:
 ; CHECK: rsubhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 entry:
-  %vrsubhn2.i = tail call <2 x i32> @llvm.arm64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+  %vrsubhn2.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
   ret <2 x i32> %vrsubhn2.i
 }
 
@@ -978,7 +978,7 @@
 ; CHECK-LABEL: test_vrsubhn_u16:
 ; CHECK: rsubhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 entry:
-  %vrsubhn2.i = tail call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+  %vrsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
   ret <8 x i8> %vrsubhn2.i
 }
 
@@ -986,7 +986,7 @@
 ; CHECK-LABEL: test_vrsubhn_u32:
 ; CHECK: rsubhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 entry:
-  %vrsubhn2.i = tail call <4 x i16> @llvm.arm64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+  %vrsubhn2.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
   ret <4 x i16> %vrsubhn2.i
 }
 
@@ -994,7 +994,7 @@
 ; CHECK-LABEL: test_vrsubhn_u64:
 ; CHECK: rsubhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 entry:
-  %vrsubhn2.i = tail call <2 x i32> @llvm.arm64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+  %vrsubhn2.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
   ret <2 x i32> %vrsubhn2.i
 }
 
@@ -1002,7 +1002,7 @@
 ; CHECK-LABEL: test_vrsubhn_high_s16:
 ; CHECK: rsubhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 entry:
-  %vrsubhn2.i.i = tail call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+  %vrsubhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
   %0 = bitcast <8 x i8> %r to <1 x i64>
   %1 = bitcast <8 x i8> %vrsubhn2.i.i to <1 x i64>
   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -1014,7 +1014,7 @@
 ; CHECK-LABEL: test_vrsubhn_high_s32:
 ; CHECK: rsubhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 entry:
-  %vrsubhn2.i.i = tail call <4 x i16> @llvm.arm64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+  %vrsubhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
   %0 = bitcast <4 x i16> %r to <1 x i64>
   %1 = bitcast <4 x i16> %vrsubhn2.i.i to <1 x i64>
   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -1026,7 +1026,7 @@
 ; CHECK-LABEL: test_vrsubhn_high_s64:
 ; CHECK: rsubhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 entry:
-  %vrsubhn2.i.i = tail call <2 x i32> @llvm.arm64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+  %vrsubhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
   %0 = bitcast <2 x i32> %r to <1 x i64>
   %1 = bitcast <2 x i32> %vrsubhn2.i.i to <1 x i64>
   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -1038,7 +1038,7 @@
 ; CHECK-LABEL: test_vrsubhn_high_u16:
 ; CHECK: rsubhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 entry:
-  %vrsubhn2.i.i = tail call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
+  %vrsubhn2.i.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b)
   %0 = bitcast <8 x i8> %r to <1 x i64>
   %1 = bitcast <8 x i8> %vrsubhn2.i.i to <1 x i64>
   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -1050,7 +1050,7 @@
 ; CHECK-LABEL: test_vrsubhn_high_u32:
 ; CHECK: rsubhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 entry:
-  %vrsubhn2.i.i = tail call <4 x i16> @llvm.arm64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
+  %vrsubhn2.i.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b)
   %0 = bitcast <4 x i16> %r to <1 x i64>
   %1 = bitcast <4 x i16> %vrsubhn2.i.i to <1 x i64>
   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -1062,7 +1062,7 @@
 ; CHECK-LABEL: test_vrsubhn_high_u64:
 ; CHECK: rsubhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 entry:
-  %vrsubhn2.i.i = tail call <2 x i32> @llvm.arm64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
+  %vrsubhn2.i.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b)
   %0 = bitcast <2 x i32> %r to <1 x i64>
   %1 = bitcast <2 x i32> %vrsubhn2.i.i to <1 x i64>
   %shuffle.i.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -1074,7 +1074,7 @@
 ; CHECK-LABEL: test_vabdl_s8:
 ; CHECK: sabdl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 entry:
-  %vabd.i.i = tail call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b)
+  %vabd.i.i = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b)
   %vmovl.i.i = zext <8 x i8> %vabd.i.i to <8 x i16>
   ret <8 x i16> %vmovl.i.i
 }
@@ -1083,7 +1083,7 @@
 ; CHECK-LABEL: test_vabdl_s16:
 ; CHECK: sabdl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 entry:
-  %vabd2.i.i = tail call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %a, <4 x i16> %b)
+  %vabd2.i.i = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %a, <4 x i16> %b)
   %vmovl.i.i = zext <4 x i16> %vabd2.i.i to <4 x i32>
   ret <4 x i32> %vmovl.i.i
 }
@@ -1092,7 +1092,7 @@
 ; CHECK-LABEL: test_vabdl_s32:
 ; CHECK: sabdl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
-  %vabd2.i.i = tail call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %a, <2 x i32> %b)
+  %vabd2.i.i = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %a, <2 x i32> %b)
   %vmovl.i.i = zext <2 x i32> %vabd2.i.i to <2 x i64>
   ret <2 x i64> %vmovl.i.i
 }
@@ -1101,7 +1101,7 @@
 ; CHECK-LABEL: test_vabdl_u8:
 ; CHECK: uabdl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 entry:
-  %vabd.i.i = tail call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b)
+  %vabd.i.i = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b)
   %vmovl.i.i = zext <8 x i8> %vabd.i.i to <8 x i16>
   ret <8 x i16> %vmovl.i.i
 }
@@ -1110,7 +1110,7 @@
 ; CHECK-LABEL: test_vabdl_u16:
 ; CHECK: uabdl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 entry:
-  %vabd2.i.i = tail call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %a, <4 x i16> %b)
+  %vabd2.i.i = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %a, <4 x i16> %b)
   %vmovl.i.i = zext <4 x i16> %vabd2.i.i to <4 x i32>
   ret <4 x i32> %vmovl.i.i
 }
@@ -1119,7 +1119,7 @@
 ; CHECK-LABEL: test_vabdl_u32:
 ; CHECK: uabdl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
-  %vabd2.i.i = tail call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %a, <2 x i32> %b)
+  %vabd2.i.i = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %a, <2 x i32> %b)
   %vmovl.i.i = zext <2 x i32> %vabd2.i.i to <2 x i64>
   ret <2 x i64> %vmovl.i.i
 }
@@ -1128,7 +1128,7 @@
 ; CHECK-LABEL: test_vabal_s8:
 ; CHECK: sabal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 entry:
-  %vabd.i.i.i = tail call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> %c)
+  %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> %c)
   %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
   %add.i = add <8 x i16> %vmovl.i.i.i, %a
   ret <8 x i16> %add.i
@@ -1138,7 +1138,7 @@
 ; CHECK-LABEL: test_vabal_s16:
 ; CHECK: sabal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 entry:
-  %vabd2.i.i.i = tail call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %b, <4 x i16> %c)
+  %vabd2.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %b, <4 x i16> %c)
   %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
   %add.i = add <4 x i32> %vmovl.i.i.i, %a
   ret <4 x i32> %add.i
@@ -1148,7 +1148,7 @@
 ; CHECK-LABEL: test_vabal_s32:
 ; CHECK: sabal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
-  %vabd2.i.i.i = tail call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %b, <2 x i32> %c)
+  %vabd2.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %b, <2 x i32> %c)
   %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
   %add.i = add <2 x i64> %vmovl.i.i.i, %a
   ret <2 x i64> %add.i
@@ -1158,7 +1158,7 @@
 ; CHECK-LABEL: test_vabal_u8:
 ; CHECK: uabal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 entry:
-  %vabd.i.i.i = tail call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %b, <8 x i8> %c)
+  %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %b, <8 x i8> %c)
   %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
   %add.i = add <8 x i16> %vmovl.i.i.i, %a
   ret <8 x i16> %add.i
@@ -1168,7 +1168,7 @@
 ; CHECK-LABEL: test_vabal_u16:
 ; CHECK: uabal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 entry:
-  %vabd2.i.i.i = tail call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %b, <4 x i16> %c)
+  %vabd2.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %b, <4 x i16> %c)
   %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
   %add.i = add <4 x i32> %vmovl.i.i.i, %a
   ret <4 x i32> %add.i
@@ -1178,7 +1178,7 @@
 ; CHECK-LABEL: test_vabal_u32:
 ; CHECK: uabal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
-  %vabd2.i.i.i = tail call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %b, <2 x i32> %c)
+  %vabd2.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %b, <2 x i32> %c)
   %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
   %add.i = add <2 x i64> %vmovl.i.i.i, %a
   ret <2 x i64> %add.i
@@ -1190,7 +1190,7 @@
 entry:
   %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %vabd.i.i.i = tail call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+  %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
   %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
   ret <8 x i16> %vmovl.i.i.i
 }
@@ -1201,7 +1201,7 @@
 entry:
   %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %vabd2.i.i.i = tail call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+  %vabd2.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
   %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
   ret <4 x i32> %vmovl.i.i.i
 }
@@ -1212,7 +1212,7 @@
 entry:
   %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
-  %vabd2.i.i.i = tail call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+  %vabd2.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
   %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
   ret <2 x i64> %vmovl.i.i.i
 }
@@ -1223,7 +1223,7 @@
 entry:
   %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %vabd.i.i.i = tail call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+  %vabd.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
   %vmovl.i.i.i = zext <8 x i8> %vabd.i.i.i to <8 x i16>
   ret <8 x i16> %vmovl.i.i.i
 }
@@ -1234,7 +1234,7 @@
 entry:
   %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %vabd2.i.i.i = tail call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+  %vabd2.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
   %vmovl.i.i.i = zext <4 x i16> %vabd2.i.i.i to <4 x i32>
   ret <4 x i32> %vmovl.i.i.i
 }
@@ -1245,7 +1245,7 @@
 entry:
   %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
-  %vabd2.i.i.i = tail call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+  %vabd2.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
   %vmovl.i.i.i = zext <2 x i32> %vabd2.i.i.i to <2 x i64>
   ret <2 x i64> %vmovl.i.i.i
 }
@@ -1256,7 +1256,7 @@
 entry:
   %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %vabd.i.i.i.i = tail call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+  %vabd.i.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
   %vmovl.i.i.i.i = zext <8 x i8> %vabd.i.i.i.i to <8 x i16>
   %add.i.i = add <8 x i16> %vmovl.i.i.i.i, %a
   ret <8 x i16> %add.i.i
@@ -1268,7 +1268,7 @@
 entry:
   %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %vabd2.i.i.i.i = tail call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+  %vabd2.i.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
   %vmovl.i.i.i.i = zext <4 x i16> %vabd2.i.i.i.i to <4 x i32>
   %add.i.i = add <4 x i32> %vmovl.i.i.i.i, %a
   ret <4 x i32> %add.i.i
@@ -1280,7 +1280,7 @@
 entry:
   %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
-  %vabd2.i.i.i.i = tail call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+  %vabd2.i.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
   %vmovl.i.i.i.i = zext <2 x i32> %vabd2.i.i.i.i to <2 x i64>
   %add.i.i = add <2 x i64> %vmovl.i.i.i.i, %a
   ret <2 x i64> %add.i.i
@@ -1292,7 +1292,7 @@
 entry:
   %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %vabd.i.i.i.i = tail call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+  %vabd.i.i.i.i = tail call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
   %vmovl.i.i.i.i = zext <8 x i8> %vabd.i.i.i.i to <8 x i16>
   %add.i.i = add <8 x i16> %vmovl.i.i.i.i, %a
   ret <8 x i16> %add.i.i
@@ -1304,7 +1304,7 @@
 entry:
   %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %vabd2.i.i.i.i = tail call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+  %vabd2.i.i.i.i = tail call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
   %vmovl.i.i.i.i = zext <4 x i16> %vabd2.i.i.i.i to <4 x i32>
   %add.i.i = add <4 x i32> %vmovl.i.i.i.i, %a
   ret <4 x i32> %add.i.i
@@ -1316,7 +1316,7 @@
 entry:
   %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
-  %vabd2.i.i.i.i = tail call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+  %vabd2.i.i.i.i = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
   %vmovl.i.i.i.i = zext <2 x i32> %vabd2.i.i.i.i to <2 x i64>
   %add.i.i = add <2 x i64> %vmovl.i.i.i.i, %a
   ret <2 x i64> %add.i.i
@@ -1326,7 +1326,7 @@
 ; CHECK-LABEL: test_vmull_s8:
 ; CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 entry:
-  %vmull.i = tail call <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8> %a, <8 x i8> %b)
+  %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a, <8 x i8> %b)
   ret <8 x i16> %vmull.i
 }
 
@@ -1334,7 +1334,7 @@
 ; CHECK-LABEL: test_vmull_s16:
 ; CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 entry:
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %b)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %b)
   ret <4 x i32> %vmull2.i
 }
 
@@ -1342,7 +1342,7 @@
 ; CHECK-LABEL: test_vmull_s32:
 ; CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %b)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %b)
   ret <2 x i64> %vmull2.i
 }
 
@@ -1350,7 +1350,7 @@
 ; CHECK-LABEL: test_vmull_u8:
 ; CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 entry:
-  %vmull.i = tail call <8 x i16> @llvm.arm64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b)
+  %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b)
   ret <8 x i16> %vmull.i
 }
 
@@ -1358,7 +1358,7 @@
 ; CHECK-LABEL: test_vmull_u16:
 ; CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 entry:
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %b)
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %b)
   ret <4 x i32> %vmull2.i
 }
 
@@ -1366,7 +1366,7 @@
 ; CHECK-LABEL: test_vmull_u32:
 ; CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %b)
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %b)
   ret <2 x i64> %vmull2.i
 }
 
@@ -1376,7 +1376,7 @@
 entry:
   %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %vmull.i.i = tail call <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
   ret <8 x i16> %vmull.i.i
 }
 
@@ -1386,7 +1386,7 @@
 entry:
   %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
   ret <4 x i32> %vmull2.i.i
 }
 
@@ -1396,7 +1396,7 @@
 entry:
   %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
-  %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
   ret <2 x i64> %vmull2.i.i
 }
 
@@ -1406,7 +1406,7 @@
 entry:
   %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %vmull.i.i = tail call <8 x i16> @llvm.arm64.neon.umull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
   ret <8 x i16> %vmull.i.i
 }
 
@@ -1416,7 +1416,7 @@
 entry:
   %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
   ret <4 x i32> %vmull2.i.i
 }
 
@@ -1426,7 +1426,7 @@
 entry:
   %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
-  %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
   ret <2 x i64> %vmull2.i.i
 }
 
@@ -1434,7 +1434,7 @@
 ; CHECK-LABEL: test_vmlal_s8:
 ; CHECK: smlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 entry:
-  %vmull.i.i = tail call <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c)
+  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c)
   %add.i = add <8 x i16> %vmull.i.i, %a
   ret <8 x i16> %add.i
 }
@@ -1443,7 +1443,7 @@
 ; CHECK-LABEL: test_vmlal_s16:
 ; CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 entry:
-  %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %c)
+  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %c)
   %add.i = add <4 x i32> %vmull2.i.i, %a
   ret <4 x i32> %add.i
 }
@@ -1452,7 +1452,7 @@
 ; CHECK-LABEL: test_vmlal_s32:
 ; CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
-  %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %c)
+  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %c)
   %add.i = add <2 x i64> %vmull2.i.i, %a
   ret <2 x i64> %add.i
 }
@@ -1461,7 +1461,7 @@
 ; CHECK-LABEL: test_vmlal_u8:
 ; CHECK: umlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 entry:
-  %vmull.i.i = tail call <8 x i16> @llvm.arm64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c)
+  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c)
   %add.i = add <8 x i16> %vmull.i.i, %a
   ret <8 x i16> %add.i
 }
@@ -1470,7 +1470,7 @@
 ; CHECK-LABEL: test_vmlal_u16:
 ; CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 entry:
-  %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %c)
+  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %c)
   %add.i = add <4 x i32> %vmull2.i.i, %a
   ret <4 x i32> %add.i
 }
@@ -1479,7 +1479,7 @@
 ; CHECK-LABEL: test_vmlal_u32:
 ; CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
-  %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %c)
+  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %c)
   %add.i = add <2 x i64> %vmull2.i.i, %a
   ret <2 x i64> %add.i
 }
@@ -1490,7 +1490,7 @@
 entry:
   %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %vmull.i.i.i = tail call <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+  %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
   %add.i.i = add <8 x i16> %vmull.i.i.i, %a
   ret <8 x i16> %add.i.i
 }
@@ -1501,7 +1501,7 @@
 entry:
   %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %vmull2.i.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+  %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
   %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
   ret <4 x i32> %add.i.i
 }
@@ -1512,7 +1512,7 @@
 entry:
   %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
-  %vmull2.i.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+  %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
   %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
   ret <2 x i64> %add.i.i
 }
@@ -1523,7 +1523,7 @@
 entry:
   %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %vmull.i.i.i = tail call <8 x i16> @llvm.arm64.neon.umull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+  %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
   %add.i.i = add <8 x i16> %vmull.i.i.i, %a
   ret <8 x i16> %add.i.i
 }
@@ -1534,7 +1534,7 @@
 entry:
   %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %vmull2.i.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+  %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
   %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
   ret <4 x i32> %add.i.i
 }
@@ -1545,7 +1545,7 @@
 entry:
   %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
-  %vmull2.i.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+  %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
   %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
   ret <2 x i64> %add.i.i
 }
@@ -1554,7 +1554,7 @@
 ; CHECK-LABEL: test_vmlsl_s8:
 ; CHECK: smlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 entry:
-  %vmull.i.i = tail call <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c)
+  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c)
   %sub.i = sub <8 x i16> %a, %vmull.i.i
   ret <8 x i16> %sub.i
 }
@@ -1563,7 +1563,7 @@
 ; CHECK-LABEL: test_vmlsl_s16:
 ; CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 entry:
-  %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %c)
+  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %c)
   %sub.i = sub <4 x i32> %a, %vmull2.i.i
   ret <4 x i32> %sub.i
 }
@@ -1572,7 +1572,7 @@
 ; CHECK-LABEL: test_vmlsl_s32:
 ; CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
-  %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %c)
+  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %c)
   %sub.i = sub <2 x i64> %a, %vmull2.i.i
   ret <2 x i64> %sub.i
 }
@@ -1581,7 +1581,7 @@
 ; CHECK-LABEL: test_vmlsl_u8:
 ; CHECK: umlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 entry:
-  %vmull.i.i = tail call <8 x i16> @llvm.arm64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c)
+  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c)
   %sub.i = sub <8 x i16> %a, %vmull.i.i
   ret <8 x i16> %sub.i
 }
@@ -1590,7 +1590,7 @@
 ; CHECK-LABEL: test_vmlsl_u16:
 ; CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 entry:
-  %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %c)
+  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %c)
   %sub.i = sub <4 x i32> %a, %vmull2.i.i
   ret <4 x i32> %sub.i
 }
@@ -1599,7 +1599,7 @@
 ; CHECK-LABEL: test_vmlsl_u32:
 ; CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
-  %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %c)
+  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %c)
   %sub.i = sub <2 x i64> %a, %vmull2.i.i
   ret <2 x i64> %sub.i
 }
@@ -1610,7 +1610,7 @@
 entry:
   %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %vmull.i.i.i = tail call <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+  %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
   %sub.i.i = sub <8 x i16> %a, %vmull.i.i.i
   ret <8 x i16> %sub.i.i
 }
@@ -1621,7 +1621,7 @@
 entry:
   %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %vmull2.i.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+  %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
   %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
   ret <4 x i32> %sub.i.i
 }
@@ -1632,7 +1632,7 @@
 entry:
   %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
-  %vmull2.i.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+  %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
   %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
   ret <2 x i64> %sub.i.i
 }
@@ -1643,7 +1643,7 @@
 entry:
   %shuffle.i.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %shuffle.i3.i = shufflevector <16 x i8> %c, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %vmull.i.i.i = tail call <8 x i16> @llvm.arm64.neon.umull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+  %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
   %sub.i.i = sub <8 x i16> %a, %vmull.i.i.i
   ret <8 x i16> %sub.i.i
 }
@@ -1654,7 +1654,7 @@
 entry:
   %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %vmull2.i.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+  %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
   %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
   ret <4 x i32> %sub.i.i
 }
@@ -1665,7 +1665,7 @@
 entry:
   %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
-  %vmull2.i.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+  %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
   %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
   ret <2 x i64> %sub.i.i
 }
@@ -1674,7 +1674,7 @@
 ; CHECK-LABEL: test_vqdmull_s16:
 ; CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 entry:
-  %vqdmull2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %b)
+  %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %b)
   ret <4 x i32> %vqdmull2.i
 }
 
@@ -1682,7 +1682,7 @@
 ; CHECK-LABEL: test_vqdmull_s32:
 ; CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
-  %vqdmull2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %b)
+  %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %b)
   ret <2 x i64> %vqdmull2.i
 }
 
@@ -1690,8 +1690,8 @@
 ; CHECK-LABEL: test_vqdmlal_s16:
 ; CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 entry:
-  %vqdmlal2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
-  %vqdmlal4.i = tail call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
+  %vqdmlal2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
+  %vqdmlal4.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i)
   ret <4 x i32> %vqdmlal4.i
 }
 
@@ -1699,8 +1699,8 @@
 ; CHECK-LABEL: test_vqdmlal_s32:
 ; CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
-  %vqdmlal2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
-  %vqdmlal4.i = tail call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
+  %vqdmlal2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
+  %vqdmlal4.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i)
   ret <2 x i64> %vqdmlal4.i
 }
 
@@ -1708,8 +1708,8 @@
 ; CHECK-LABEL: test_vqdmlsl_s16:
 ; CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 entry:
-  %vqdmlsl2.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
-  %vqdmlsl4.i = tail call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
+  %vqdmlsl2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %b, <4 x i16> %c)
+  %vqdmlsl4.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i)
   ret <4 x i32> %vqdmlsl4.i
 }
 
@@ -1717,8 +1717,8 @@
 ; CHECK-LABEL: test_vqdmlsl_s32:
 ; CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
-  %vqdmlsl2.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
-  %vqdmlsl4.i = tail call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
+  %vqdmlsl2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %b, <2 x i32> %c)
+  %vqdmlsl4.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i)
   ret <2 x i64> %vqdmlsl4.i
 }
 
@@ -1728,7 +1728,7 @@
 entry:
   %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle.i3.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %vqdmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+  %vqdmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
   ret <4 x i32> %vqdmull2.i.i
 }
 
@@ -1738,7 +1738,7 @@
 entry:
   %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle.i3.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
-  %vqdmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+  %vqdmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
   ret <2 x i64> %vqdmull2.i.i
 }
 
@@ -1748,8 +1748,8 @@
 entry:
   %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %vqdmlal2.i.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
-  %vqdmlal4.i.i = tail call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i.i)
+  %vqdmlal2.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+  %vqdmlal4.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal2.i.i)
   ret <4 x i32> %vqdmlal4.i.i
 }
 
@@ -1759,8 +1759,8 @@
 entry:
   %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
-  %vqdmlal2.i.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
-  %vqdmlal4.i.i = tail call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i.i)
+  %vqdmlal2.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+  %vqdmlal4.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal2.i.i)
   ret <2 x i64> %vqdmlal4.i.i
 }
 
@@ -1770,8 +1770,8 @@
 entry:
   %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle.i3.i = shufflevector <8 x i16> %c, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %vqdmlsl2.i.i = tail call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
-  %vqdmlsl4.i.i = tail call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i.i)
+  %vqdmlsl2.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %shuffle.i3.i)
+  %vqdmlsl4.i.i = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl2.i.i)
   ret <4 x i32> %vqdmlsl4.i.i
 }
 
@@ -1781,8 +1781,8 @@
 entry:
   %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %shuffle.i3.i = shufflevector <4 x i32> %c, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
-  %vqdmlsl2.i.i = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
-  %vqdmlsl4.i.i = tail call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i.i)
+  %vqdmlsl2.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %shuffle.i3.i)
+  %vqdmlsl4.i.i = tail call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl2.i.i)
   ret <2 x i64> %vqdmlsl4.i.i
 }
 
@@ -1790,7 +1790,7 @@
 ; CHECK-LABEL: test_vmull_p8:
 ; CHECK: pmull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 entry:
-  %vmull.i = tail call <8 x i16> @llvm.arm64.neon.pmull.v8i16(<8 x i8> %a, <8 x i8> %b)
+  %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %a, <8 x i8> %b)
   ret <8 x i16> %vmull.i
 }
 
@@ -1800,7 +1800,7 @@
 entry:
   %shuffle.i.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %shuffle.i3.i = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %vmull.i.i = tail call <8 x i16> @llvm.arm64.neon.pmull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
+  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %shuffle.i.i, <8 x i8> %shuffle.i3.i)
   ret <8 x i16> %vmull.i.i
 }
 
@@ -1808,7 +1808,7 @@
 ; CHECK-LABEL: test_vmull_p64
 ; CHECK: pmull {{v[0-9]+}}.1q, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d
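 ; pmull with the 1d/1q arrangement is a 64x64 -> 128-bit polynomial multiply.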
 entry:
-  %vmull2.i = tail call <16 x i8> @llvm.arm64.neon.pmull64(i64 %a, i64 %b)
+  %vmull2.i = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %a, i64 %b)
   %vmull3.i = bitcast <16 x i8> %vmull2.i to i128
   ret i128 %vmull3.i
 }
@@ -1819,11 +1819,11 @@
 entry:
   %0 = extractelement <2 x i64> %a, i32 1
   %1 = extractelement <2 x i64> %b, i32 1
-  %vmull2.i.i = tail call <16 x i8> @llvm.arm64.neon.pmull64(i64 %0, i64 %1) #1
+  %vmull2.i.i = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %0, i64 %1) #1
   %vmull3.i.i = bitcast <16 x i8> %vmull2.i.i to i128
   ret i128 %vmull3.i.i
 }
 
-declare <16 x i8> @llvm.arm64.neon.pmull64(i64, i64) #5
+declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64) #5
 
 
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll b/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll
new file mode 100644
index 0000000..6404ab7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-aba-abd.ll
@@ -0,0 +1,236 @@
+; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_uabd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_uabd_v8i8:
+  %abd = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: uabd v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %abd
+}
+
+define <8 x i8> @test_uaba_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_uaba_v8i8:
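+; The explicit add of the uabd result should fold into the single accumulating uaba.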
+  %abd = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+  %aba = add <8 x i8> %lhs, %abd
+; CHECK: uaba v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %aba
+}
+
+define <8 x i8> @test_sabd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_sabd_v8i8:
+  %abd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: sabd v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %abd
+}
+
+define <8 x i8> @test_saba_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_saba_v8i8:
+  %abd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+  %aba = add <8 x i8> %lhs, %abd
+; CHECK: saba v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %aba
+}
+
+declare <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_uabd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_uabd_v16i8:
+  %abd = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: uabd v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %abd
+}
+
+define <16 x i8> @test_uaba_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_uaba_v16i8:
+  %abd = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+  %aba = add <16 x i8> %lhs, %abd
+; CHECK: uaba v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %aba
+}
+
+define <16 x i8> @test_sabd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_sabd_v16i8:
+  %abd = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: sabd v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %abd
+}
+
+define <16 x i8> @test_saba_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_saba_v16i8:
+  %abd = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+  %aba = add <16 x i8> %lhs, %abd
+; CHECK: saba v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %aba
+}
+
+declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_uabd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_uabd_v4i16:
+  %abd = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: uabd v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %abd
+}
+
+define <4 x i16> @test_uaba_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_uaba_v4i16:
+  %abd = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+  %aba = add <4 x i16> %lhs, %abd
+; CHECK: uaba v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %aba
+}
+
+define <4 x i16> @test_sabd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_sabd_v4i16:
+  %abd = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: sabd v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %abd
+}
+
+define <4 x i16> @test_saba_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_saba_v4i16:
+  %abd = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+  %aba = add <4 x i16> %lhs, %abd
+; CHECK: saba v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %aba
+}
+
+declare <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_uabd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_uabd_v8i16:
+  %abd = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: uabd v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %abd
+}
+
+define <8 x i16> @test_uaba_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_uaba_v8i16:
+  %abd = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+  %aba = add <8 x i16> %lhs, %abd
+; CHECK: uaba v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %aba
+}
+
+define <8 x i16> @test_sabd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_sabd_v8i16:
+  %abd = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: sabd v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %abd
+}
+
+define <8 x i16> @test_saba_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_saba_v8i16:
+  %abd = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+  %aba = add <8 x i16> %lhs, %abd
+; CHECK: saba v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %aba
+}
+
+declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_uabd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_uabd_v2i32:
+  %abd = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: uabd v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %abd
+}
+
+define <2 x i32> @test_uaba_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_uaba_v2i32:
+  %abd = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+  %aba = add <2 x i32> %lhs, %abd
+; CHECK: uaba v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %aba
+}
+
+define <2 x i32> @test_sabd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_sabd_v2i32:
+  %abd = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: sabd v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %abd
+}
+
+define <2 x i32> @test_sabd_v2i32_const() {
+; CHECK: test_sabd_v2i32_const:
+; CHECK: movi     d1, #0x00ffffffff0000
+; CHECK-NEXT: sabd v0.2s, v0.2s, v1.2s
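+; One constant operand is materialized with movi; the sabd itself is not constant folded.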
+  %1 = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(
+    <2 x i32> <i32 -2147483648, i32 2147450880>,
+    <2 x i32> <i32 -65536, i32 65535>)
+  ret <2 x i32> %1
+}
+
+define <2 x i32> @test_saba_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_saba_v2i32:
+  %abd = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+  %aba = add <2 x i32> %lhs, %abd
+; CHECK: saba v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %aba
+}
+
+declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_uabd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_uabd_v4i32:
+  %abd = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: uabd v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %abd
+}
+
+define <4 x i32> @test_uaba_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_uaba_v4i32:
+  %abd = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+  %aba = add <4 x i32> %lhs, %abd
+; CHECK: uaba v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %aba
+}
+
+define <4 x i32> @test_sabd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_sabd_v4i32:
+  %abd = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: sabd v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %abd
+}
+
+define <4 x i32> @test_saba_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_saba_v4i32:
+  %abd = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+  %aba = add <4 x i32> %lhs, %abd
+; CHECK: saba v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %aba
+}
+
+declare <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float>, <2 x float>)
+
+define <2 x float> @test_fabd_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; CHECK: test_fabd_v2f32:
+  %abd = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+; CHECK: fabd v0.2s, v0.2s, v1.2s
+  ret <2 x float> %abd
+}
+
+declare <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float>, <4 x float>)
+
+define <4 x float> @test_fabd_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; CHECK: test_fabd_v4f32:
+  %abd = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+; CHECK: fabd v0.4s, v0.4s, v1.4s
+  ret <4 x float> %abd
+}
+
+declare <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double>, <2 x double>)
+
+define <2 x double> @test_fabd_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; CHECK: test_fabd_v2f64:
+  %abd = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+; CHECK: fabd v0.2d, v0.2d, v1.2d
+  ret <2 x double> %abd
+}
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-across.ll b/llvm/test/CodeGen/AArch64/arm64-neon-across.ll
new file mode 100644
index 0000000..3a63673
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-across.ll
@@ -0,0 +1,460 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+declare float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float>)
+
+declare float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float>)
+
+declare float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float>)
+
+declare float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float>)
+
+declare i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32>)
+
+declare i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16>)
+
+declare i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8>)
+
+declare i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16>)
+
+declare i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8>)
+
+declare i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32>)
+
+declare i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16>)
+
+declare i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8>)
+
+declare i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32>)
+
+declare i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16>)
+
+declare i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8>)
+
+declare i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16>)
+
+declare i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8>)
+
+declare i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16>)
+
+declare i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8>)
+
+declare i32 @llvm.aarch64.neon.umaxv.i32.v4i32(<4 x i32>)
+
+declare i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16>)
+
+declare i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8>)
+
+declare i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32>)
+
+declare i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16>)
+
+declare i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8>)
+
+declare i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16>)
+
+declare i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8>)
+
+declare i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16>)
+
+declare i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8>)
+
+declare i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32>)
+
+declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16>)
+
+declare i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8>)
+
+declare i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32>)
+
+declare i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16>)
+
+declare i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8>)
+
+declare i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16>)
+
+declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8>)
+
+declare i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16>)
+
+declare i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8>)
+
+define i16 @test_vaddlv_s8(<8 x i8> %a) {
+; CHECK: test_vaddlv_s8:
+; CHECK: saddlv h{{[0-9]+}}, {{v[0-9]+}}.8b
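+; saddlv widens each lane before summing, so the v8i8 reduction produces a 16-bit result.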
+entry:
+  %saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> %a)
+  %0 = trunc i32 %saddlvv.i to i16
+  ret i16 %0
+}
+
+define i32 @test_vaddlv_s16(<4 x i16> %a) {
+; CHECK: test_vaddlv_s16:
+; CHECK: saddlv s{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+  %saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16> %a)
+  ret i32 %saddlvv.i
+}
+
+define i16 @test_vaddlv_u8(<8 x i8> %a) {
+; CHECK: test_vaddlv_u8:
+; CHECK: uaddlv h{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+  %uaddlvv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> %a)
+  %0 = trunc i32 %uaddlvv.i to i16
+  ret i16 %0
+}
+
+define i32 @test_vaddlv_u16(<4 x i16> %a) {
+; CHECK: test_vaddlv_u16:
+; CHECK: uaddlv s{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+  %uaddlvv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16> %a)
+  ret i32 %uaddlvv.i
+}
+
+define i16 @test_vaddlvq_s8(<16 x i8> %a) {
+; CHECK: test_vaddlvq_s8:
+; CHECK: saddlv h{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+  %saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> %a)
+  %0 = trunc i32 %saddlvv.i to i16
+  ret i16 %0
+}
+
+define i32 @test_vaddlvq_s16(<8 x i16> %a) {
+; CHECK: test_vaddlvq_s16:
+; CHECK: saddlv s{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+  %saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16> %a)
+  ret i32 %saddlvv.i
+}
+
+define i64 @test_vaddlvq_s32(<4 x i32> %a) {
+; CHECK: test_vaddlvq_s32:
+; CHECK: saddlv d{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+  %saddlvv.i = tail call i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32> %a)
+  ret i64 %saddlvv.i
+}
+
+define i16 @test_vaddlvq_u8(<16 x i8> %a) {
+; CHECK: test_vaddlvq_u8:
+; CHECK: uaddlv h{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+  %uaddlvv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> %a)
+  %0 = trunc i32 %uaddlvv.i to i16
+  ret i16 %0
+}
+
+define i32 @test_vaddlvq_u16(<8 x i16> %a) {
+; CHECK: test_vaddlvq_u16:
+; CHECK: uaddlv s{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+  %uaddlvv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> %a)
+  ret i32 %uaddlvv.i
+}
+
+define i64 @test_vaddlvq_u32(<4 x i32> %a) {
+; CHECK: test_vaddlvq_u32:
+; CHECK: uaddlv d{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+  %uaddlvv.i = tail call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> %a)
+  ret i64 %uaddlvv.i
+}
+
+define i8 @test_vmaxv_s8(<8 x i8> %a) {
+; CHECK: test_vmaxv_s8:
+; CHECK: smaxv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+  %smaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8> %a)
+  %0 = trunc i32 %smaxv.i to i8
+  ret i8 %0
+}
+
+define i16 @test_vmaxv_s16(<4 x i16> %a) {
+; CHECK: test_vmaxv_s16:
+; CHECK: smaxv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+  %smaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16> %a)
+  %0 = trunc i32 %smaxv.i to i16
+  ret i16 %0
+}
+
+define i8 @test_vmaxv_u8(<8 x i8> %a) {
+; CHECK: test_vmaxv_u8:
+; CHECK: umaxv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+  %umaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> %a)
+  %0 = trunc i32 %umaxv.i to i8
+  ret i8 %0
+}
+
+define i16 @test_vmaxv_u16(<4 x i16> %a) {
+; CHECK: test_vmaxv_u16:
+; CHECK: umaxv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+  %umaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16> %a)
+  %0 = trunc i32 %umaxv.i to i16
+  ret i16 %0
+}
+
+define i8 @test_vmaxvq_s8(<16 x i8> %a) {
+; CHECK: test_vmaxvq_s8:
+; CHECK: smaxv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+  %smaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8> %a)
+  %0 = trunc i32 %smaxv.i to i8
+  ret i8 %0
+}
+
+define i16 @test_vmaxvq_s16(<8 x i16> %a) {
+; CHECK: test_vmaxvq_s16:
+; CHECK: smaxv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+  %smaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16> %a)
+  %0 = trunc i32 %smaxv.i to i16
+  ret i16 %0
+}
+
+define i32 @test_vmaxvq_s32(<4 x i32> %a) {
+; CHECK: test_vmaxvq_s32:
+; CHECK: smaxv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+  %smaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32> %a)
+  ret i32 %smaxv.i
+}
+
+define i8 @test_vmaxvq_u8(<16 x i8> %a) {
+; CHECK: test_vmaxvq_u8:
+; CHECK: umaxv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+  %umaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> %a)
+  %0 = trunc i32 %umaxv.i to i8
+  ret i8 %0
+}
+
+define i16 @test_vmaxvq_u16(<8 x i16> %a) {
+; CHECK: test_vmaxvq_u16:
+; CHECK: umaxv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+  %umaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16> %a)
+  %0 = trunc i32 %umaxv.i to i16
+  ret i16 %0
+}
+
+define i32 @test_vmaxvq_u32(<4 x i32> %a) {
+; CHECK: test_vmaxvq_u32:
+; CHECK: umaxv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+  %umaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v4i32(<4 x i32> %a)
+  ret i32 %umaxv.i
+}
+
+define i8 @test_vminv_s8(<8 x i8> %a) {
+; CHECK: test_vminv_s8:
+; CHECK: sminv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+  %sminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8> %a)
+  %0 = trunc i32 %sminv.i to i8
+  ret i8 %0
+}
+
+define i16 @test_vminv_s16(<4 x i16> %a) {
+; CHECK: test_vminv_s16:
+; CHECK: sminv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+  %sminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16> %a)
+  %0 = trunc i32 %sminv.i to i16
+  ret i16 %0
+}
+
+define i8 @test_vminv_u8(<8 x i8> %a) {
+; CHECK: test_vminv_u8:
+; CHECK: uminv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+  %uminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %a)
+  %0 = trunc i32 %uminv.i to i8
+  ret i8 %0
+}
+
+define i16 @test_vminv_u16(<4 x i16> %a) {
+; CHECK: test_vminv_u16:
+; CHECK: uminv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+  %uminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> %a)
+  %0 = trunc i32 %uminv.i to i16
+  ret i16 %0
+}
+
+define i8 @test_vminvq_s8(<16 x i8> %a) {
+; CHECK: test_vminvq_s8:
+; CHECK: sminv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+  %sminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8> %a)
+  %0 = trunc i32 %sminv.i to i8
+  ret i8 %0
+}
+
+define i16 @test_vminvq_s16(<8 x i16> %a) {
+; CHECK: test_vminvq_s16:
+; CHECK: sminv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+  %sminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> %a)
+  %0 = trunc i32 %sminv.i to i16
+  ret i16 %0
+}
+
+define i32 @test_vminvq_s32(<4 x i32> %a) {
+; CHECK: test_vminvq_s32:
+; CHECK: sminv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+  %sminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32> %a)
+  ret i32 %sminv.i
+}
+
+define i8 @test_vminvq_u8(<16 x i8> %a) {
+; CHECK: test_vminvq_u8:
+; CHECK: uminv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+  %uminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %a)
+  %0 = trunc i32 %uminv.i to i8
+  ret i8 %0
+}
+
+define i16 @test_vminvq_u16(<8 x i16> %a) {
+; CHECK: test_vminvq_u16:
+; CHECK: uminv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+  %uminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> %a)
+  %0 = trunc i32 %uminv.i to i16
+  ret i16 %0
+}
+
+define i32 @test_vminvq_u32(<4 x i32> %a) {
+; CHECK: test_vminvq_u32:
+; CHECK: uminv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+  %uminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32> %a)
+  ret i32 %uminv.i
+}
+
+define i8 @test_vaddv_s8(<8 x i8> %a) {
+; CHECK: test_vaddv_s8:
+; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> %a)
+  %0 = trunc i32 %vaddv.i to i8
+  ret i8 %0
+}
+
+define i16 @test_vaddv_s16(<4 x i16> %a) {
+; CHECK: test_vaddv_s16:
+; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> %a)
+  %0 = trunc i32 %vaddv.i to i16
+  ret i16 %0
+}
+
+define i8 @test_vaddv_u8(<8 x i8> %a) {
+; CHECK: test_vaddv_u8:
+; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.8b
+entry:
+  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> %a)
+  %0 = trunc i32 %vaddv.i to i8
+  ret i8 %0
+}
+
+define i16 @test_vaddv_u16(<4 x i16> %a) {
+; CHECK: test_vaddv_u16:
+; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.4h
+entry:
+  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> %a)
+  %0 = trunc i32 %vaddv.i to i16
+  ret i16 %0
+}
+
+define i8 @test_vaddvq_s8(<16 x i8> %a) {
+; CHECK: test_vaddvq_s8:
+; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> %a)
+  %0 = trunc i32 %vaddv.i to i8
+  ret i8 %0
+}
+
+define i16 @test_vaddvq_s16(<8 x i16> %a) {
+; CHECK: test_vaddvq_s16:
+; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> %a)
+  %0 = trunc i32 %vaddv.i to i16
+  ret i16 %0
+}
+
+define i32 @test_vaddvq_s32(<4 x i32> %a) {
+; CHECK: test_vaddvq_s32:
+; CHECK: addv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> %a)
+  ret i32 %vaddv.i
+}
+
+define i8 @test_vaddvq_u8(<16 x i8> %a) {
+; CHECK: test_vaddvq_u8:
+; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.16b
+entry:
+  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> %a)
+  %0 = trunc i32 %vaddv.i to i8
+  ret i8 %0
+}
+
+define i16 @test_vaddvq_u16(<8 x i16> %a) {
+; CHECK: test_vaddvq_u16:
+; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.8h
+entry:
+  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> %a)
+  %0 = trunc i32 %vaddv.i to i16
+  ret i16 %0
+}
+
+define i32 @test_vaddvq_u32(<4 x i32> %a) {
+; CHECK: test_vaddvq_u32:
+; CHECK: addv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> %a)
+  ret i32 %vaddv.i
+}
+
+define float @test_vmaxvq_f32(<4 x float> %a) {
+; CHECK: test_vmaxvq_f32:
+; CHECK: fmaxv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+  %0 = call float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float> %a)
+  ret float %0
+}
+
+define float @test_vminvq_f32(<4 x float> %a) {
+; CHECK: test_vminvq_f32:
+; CHECK: fminv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+  %0 = call float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float> %a)
+  ret float %0
+}
+
+define float @test_vmaxnmvq_f32(<4 x float> %a) {
+; CHECK: test_vmaxnmvq_f32:
+; CHECK: fmaxnmv s{{[0-9]+}}, {{v[0-9]+}}.4s
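+; fmaxnmv uses IEEE maxNum semantics, so quiet NaN inputs are ignored.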
+entry:
+  %0 = call float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float> %a)
+  ret float %0
+}
+
+define float @test_vminnmvq_f32(<4 x float> %a) {
+; CHECK: test_vminnmvq_f32:
+; CHECK: fminnmv s{{[0-9]+}}, {{v[0-9]+}}.4s
+entry:
+  %0 = call float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float> %a)
+  ret float %0
+}
+
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll b/llvm/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll
new file mode 100644
index 0000000..d3dc1b8
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-add-pairwise.ll
@@ -0,0 +1,100 @@
+; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+declare <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_addp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; CHECK: test_addp_v8i8:
+  %tmp1 = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: addp v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_addp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_addp_v16i8:
+  %tmp1 = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: addp v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_addp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_addp_v4i16:
+  %tmp1 = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: addp v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %tmp1
+}
+
+declare <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_addp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_addp_v8i16:
+  %tmp1 = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: addp v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %tmp1
+}
+
+declare <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_addp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_addp_v2i32:
+  %tmp1 = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: addp v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_addp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_addp_v4i32:
+  %tmp1 = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: addp v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %tmp1
+}
+
+
+declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>)
+
+define <2 x i64> @test_addp_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_addp_v2i64:
+        %val = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: addp v0.2d, v0.2d, v1.2d
+        ret <2 x i64> %val
+}
+
+declare <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @test_faddp_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; CHECK: test_faddp_v2f32:
+        %val = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+; CHECK: faddp v0.2s, v0.2s, v1.2s
+        ret <2 x float> %val
+}
+
+define <4 x float> @test_faddp_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; CHECK: test_faddp_v4f32:
+        %val = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+; CHECK: faddp v0.4s, v0.4s, v1.4s
+        ret <4 x float> %val
+}
+
+define <2 x double> @test_faddp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; CHECK: test_faddp_v2f64:
+        %val = call <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+; CHECK: faddp v0.2d, v0.2d, v1.2d
+        ret <2 x double> %val
+}
+
+define i32 @test_vaddv.v2i32(<2 x i32> %a) {
+; CHECK-LABEL: test_vaddv.v2i32
+; CHECK: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
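+; An across-vector add of only two lanes lowers to a single pairwise addp.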
+  %1 = tail call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> %a)
+  ret i32 %1
+}
+
+declare i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32>)
diff --git a/llvm/test/CodeGen/ARM64/aarch64-neon-add-sub.ll b/llvm/test/CodeGen/AArch64/arm64-neon-add-sub.ll
similarity index 89%
rename from llvm/test/CodeGen/ARM64/aarch64-neon-add-sub.ll
rename to llvm/test/CodeGen/AArch64/arm64-neon-add-sub.ll
index 241025e..fbde606 100644
--- a/llvm/test/CodeGen/ARM64/aarch64-neon-add-sub.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-add-sub.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -arm64-simd-scalar| FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -aarch64-simd-scalar | FileCheck %s
 
 define <8 x i8> @add8xi8(<8 x i8> %A, <8 x i8> %B) {
 ;CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
@@ -182,35 +182,35 @@
 define <1 x double> @test_vabd_f64(<1 x double> %a, <1 x double> %b) {
 ; CHECK-LABEL: test_vabd_f64
 ; CHECK: fabd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-  %1 = tail call <1 x double> @llvm.arm64.neon.fabd.v1f64(<1 x double> %a, <1 x double> %b)
+  %1 = tail call <1 x double> @llvm.aarch64.neon.fabd.v1f64(<1 x double> %a, <1 x double> %b)
   ret <1 x double> %1
 }
 
 define <1 x double> @test_vmax_f64(<1 x double> %a, <1 x double> %b) {
 ; CHECK-LABEL: test_vmax_f64
 ; CHECK: fmax d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-  %1 = tail call <1 x double> @llvm.arm64.neon.fmax.v1f64(<1 x double> %a, <1 x double> %b)
+  %1 = tail call <1 x double> @llvm.aarch64.neon.fmax.v1f64(<1 x double> %a, <1 x double> %b)
   ret <1 x double> %1
 }
 
 define <1 x double> @test_vmin_f64(<1 x double> %a, <1 x double> %b) {
 ; CHECK-LABEL: test_vmin_f64
 ; CHECK: fmin d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-  %1 = tail call <1 x double> @llvm.arm64.neon.fmin.v1f64(<1 x double> %a, <1 x double> %b)
+  %1 = tail call <1 x double> @llvm.aarch64.neon.fmin.v1f64(<1 x double> %a, <1 x double> %b)
   ret <1 x double> %1
 }
 
 define <1 x double> @test_vmaxnm_f64(<1 x double> %a, <1 x double> %b) {
 ; CHECK-LABEL: test_vmaxnm_f64
 ; CHECK: fmaxnm d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-  %1 = tail call <1 x double> @llvm.arm64.neon.fmaxnm.v1f64(<1 x double> %a, <1 x double> %b)
+  %1 = tail call <1 x double> @llvm.aarch64.neon.fmaxnm.v1f64(<1 x double> %a, <1 x double> %b)
   ret <1 x double> %1
 }
 
 define <1 x double> @test_vminnm_f64(<1 x double> %a, <1 x double> %b) {
 ; CHECK-LABEL: test_vminnm_f64
 ; CHECK: fminnm d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-  %1 = tail call <1 x double> @llvm.arm64.neon.fminnm.v1f64(<1 x double> %a, <1 x double> %b)
+  %1 = tail call <1 x double> @llvm.aarch64.neon.fminnm.v1f64(<1 x double> %a, <1 x double> %b)
   ret <1 x double> %1
 }
 
@@ -229,9 +229,9 @@
 }
 
 declare <1 x double> @llvm.fabs.v1f64(<1 x double>)
-declare <1 x double> @llvm.arm64.neon.fminnm.v1f64(<1 x double>, <1 x double>)
-declare <1 x double> @llvm.arm64.neon.fmaxnm.v1f64(<1 x double>, <1 x double>)
-declare <1 x double> @llvm.arm64.neon.fmin.v1f64(<1 x double>, <1 x double>)
-declare <1 x double> @llvm.arm64.neon.fmax.v1f64(<1 x double>, <1 x double>)
-declare <1 x double> @llvm.arm64.neon.fabd.v1f64(<1 x double>, <1 x double>)
+declare <1 x double> @llvm.aarch64.neon.fminnm.v1f64(<1 x double>, <1 x double>)
+declare <1 x double> @llvm.aarch64.neon.fmaxnm.v1f64(<1 x double>, <1 x double>)
+declare <1 x double> @llvm.aarch64.neon.fmin.v1f64(<1 x double>, <1 x double>)
+declare <1 x double> @llvm.aarch64.neon.fmax.v1f64(<1 x double>, <1 x double>)
+declare <1 x double> @llvm.aarch64.neon.fabd.v1f64(<1 x double>, <1 x double>)
 declare <1 x double> @llvm.fma.v1f64(<1 x double>, <1 x double>, <1 x double>)
diff --git a/llvm/test/CodeGen/ARM64/neon-compare-instructions.ll b/llvm/test/CodeGen/AArch64/arm64-neon-compare-instructions.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/neon-compare-instructions.ll
rename to llvm/test/CodeGen/AArch64/arm64-neon-compare-instructions.ll
diff --git a/llvm/test/CodeGen/ARM64/aarch64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
similarity index 98%
rename from llvm/test/CodeGen/ARM64/aarch64-neon-copy.ll
rename to llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index 9493cad..cfc2ebf 100644
--- a/llvm/test/CodeGen/ARM64/aarch64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1030,7 +1030,7 @@
 ; CHECK: fmaxp s{{[0-9]+}}, v{{[0-9]+}}.2s
 ; CHECK-NEXT: ret
 entry:
-  %0 = call float @llvm.arm64.neon.fmaxv.f32.v2f32(<2 x float> %a)
+  %0 = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> %a)
   %1 = insertelement <1 x float> undef, float %0, i32 0
   %2 = extractelement <1 x float> %1, i32 0
   %vecinit1.i = insertelement <2 x float> undef, float %2, i32 0
@@ -1042,14 +1042,14 @@
 ; CHECK: fmaxp s{{[0-9]+}}, v{{[0-9]+}}.2s
 ; CHECK-NEXT: ret
 entry:
-  %0 = call float @llvm.arm64.neon.fmaxv.f32.v2f32(<2 x float> %a)
+  %0 = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> %a)
   %1 = insertelement <1 x float> undef, float %0, i32 0
   %2 = extractelement <1 x float> %1, i32 0
   %vecinit1.i = insertelement <4 x float> undef, float %2, i32 0
   ret <4 x float> %vecinit1.i
 }
 
-declare float @llvm.arm64.neon.fmaxv.f32.v2f32(<2 x float>)
+declare float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float>)
 
 define <2 x i32> @test_concat_undef_v1i32(<2 x i32> %a) {
 ; CHECK-LABEL: test_concat_undef_v1i32:
@@ -1060,14 +1060,14 @@
   ret <2 x i32> %vecinit1.i
 }
 
-declare i32 @llvm.arm64.neon.sqabs.i32(i32) #4
+declare i32 @llvm.aarch64.neon.sqabs.i32(i32) #4
 
 define <2 x i32> @test_concat_v1i32_undef(i32 %a) {
 ; CHECK-LABEL: test_concat_v1i32_undef:
 ; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}}
 ; CHECK-NEXT: ret
 entry:
-  %b = tail call i32 @llvm.arm64.neon.sqabs.i32(i32 %a)
+  %b = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a)
   %vecinit.i432 = insertelement <2 x i32> undef, i32 %b, i32 0
   ret <2 x i32> %vecinit.i432
 }
@@ -1088,9 +1088,9 @@
 ; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}}
 ; CHECK-NEXT: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
-  %c = tail call i32 @llvm.arm64.neon.sqabs.i32(i32 %a)
+  %c = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a)
   %d = insertelement <2 x i32> undef, i32 %c, i32 0
-  %e = tail call i32 @llvm.arm64.neon.sqabs.i32(i32 %b)
+  %e = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %b)
   %f = insertelement <2 x i32> undef, i32 %e, i32 0
   %h = shufflevector <2 x i32> %d, <2 x i32> %f, <2 x i32> <i32 0, i32 2>
   ret <2 x i32> %h
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copyPhysReg-tuple.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copyPhysReg-tuple.ll
new file mode 100644
index 0000000..276ac13
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copyPhysReg-tuple.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
+; arm64 has a separate copy of this test due to its intrinsics.
+
+define <4 x i32> @copyTuple.QPair(i32* %a, i32* %b) {
+; CHECK-LABEL: copyTuple.QPair:
+; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: ld2 { {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x{{[0-9]+|sp}}]
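+; Copying the Q-register tuple takes one mov per register before the lane load.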
+entry:
+  %vld = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 2, i32 2, i32 2, i32 2>, i64 1, i32* %a)
+  %extract = extractvalue { <4 x i32>, <4 x i32> } %vld, 0
+  %vld1 = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32> %extract, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i64 1, i32* %b)
+  %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld1, 0
+  ret <4 x i32> %vld1.fca.0.extract
+}
+
+define <4 x i32> @copyTuple.QTriple(i32* %a, i32* %b, <4 x i32> %c) {
+; CHECK-LABEL: copyTuple.QTriple:
+; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: ld3 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x{{[0-9]+|sp}}]
+entry:
+  %vld = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, <4 x i32> %c, i64 1, i32* %a)
+  %extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld, 0
+  %vld1 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32> %extract, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, i64 1, i32* %b)
+  %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld1, 0
+  ret <4 x i32> %vld1.fca.0.extract
+}
+
+define <4 x i32> @copyTuple.QQuad(i32* %a, i32* %b, <4 x i32> %c) {
+; CHECK-LABEL: copyTuple.QQuad:
+; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+; CHECK: ld4 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x{{[0-9]+|sp}}]
+entry:
+  %vld = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, <4 x i32> %c, <4 x i32> %c, i64 1, i32* %a)
+  %extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld, 0
+  %vld1 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32> %extract, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, <4 x i32> %c, i64 1, i32* %b)
+  %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld1, 0
+  ret <4 x i32> %vld1.fca.0.extract
+}
+
+declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*)
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*)
+declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*)
diff --git a/llvm/test/CodeGen/ARM64/aarch64-neon-mul-div.ll b/llvm/test/CodeGen/AArch64/arm64-neon-mul-div.ll
similarity index 92%
rename from llvm/test/CodeGen/ARM64/aarch64-neon-mul-div.ll
rename to llvm/test/CodeGen/AArch64/arm64-neon-mul-div.ll
index f3a9766..720f3eb 100644
--- a/llvm/test/CodeGen/ARM64/aarch64-neon-mul-div.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-mul-div.ll
@@ -684,98 +684,98 @@
 	ret <2 x double> %tmp3
 }
 
-declare <8 x i8> @llvm.arm64.neon.pmul.v8i8(<8 x i8>, <8 x i8>)
-declare <16 x i8> @llvm.arm64.neon.pmul.v16i8(<16 x i8>, <16 x i8>)
+declare <8 x i8> @llvm.aarch64.neon.pmul.v8i8(<8 x i8>, <8 x i8>)
+declare <16 x i8> @llvm.aarch64.neon.pmul.v16i8(<16 x i8>, <16 x i8>)
 
 define <8 x i8> @poly_mulv8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
 ; CHECK-LABEL: poly_mulv8i8:
-   %prod = call <8 x i8> @llvm.arm64.neon.pmul.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+   %prod = call <8 x i8> @llvm.aarch64.neon.pmul.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
 ; CHECK: pmul v0.8b, v0.8b, v1.8b
    ret <8 x i8> %prod
 }
 
 define <16 x i8> @poly_mulv16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
 ; CHECK-LABEL: poly_mulv16i8:
-   %prod = call <16 x i8> @llvm.arm64.neon.pmul.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+   %prod = call <16 x i8> @llvm.aarch64.neon.pmul.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
 ; CHECK: pmul v0.16b, v0.16b, v1.16b
    ret <16 x i8> %prod
 }
 
-declare <4 x i16> @llvm.arm64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>)
-declare <8 x i16> @llvm.arm64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>)
-declare <2 x i32> @llvm.arm64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>)
-declare <4 x i32> @llvm.arm64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>)
 
 define <4 x i16> @test_sqdmulh_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; CHECK-LABEL: test_sqdmulh_v4i16:
-   %prod = call <4 x i16> @llvm.arm64.neon.sqdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+   %prod = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
 ; CHECK: sqdmulh v0.4h, v0.4h, v1.4h
    ret <4 x i16> %prod
 }
 
 define <8 x i16> @test_sqdmulh_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; CHECK-LABEL: test_sqdmulh_v8i16:
-   %prod = call <8 x i16> @llvm.arm64.neon.sqdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+   %prod = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
 ; CHECK: sqdmulh v0.8h, v0.8h, v1.8h
    ret <8 x i16> %prod
 }
 
 define <2 x i32> @test_sqdmulh_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; CHECK-LABEL: test_sqdmulh_v2i32:
-   %prod = call <2 x i32> @llvm.arm64.neon.sqdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+   %prod = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
 ; CHECK: sqdmulh v0.2s, v0.2s, v1.2s
    ret <2 x i32> %prod
 }
 
 define <4 x i32> @test_sqdmulh_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; CHECK-LABEL: test_sqdmulh_v4i32:
-   %prod = call <4 x i32> @llvm.arm64.neon.sqdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+   %prod = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
 ; CHECK: sqdmulh v0.4s, v0.4s, v1.4s
    ret <4 x i32> %prod
 }
 
-declare <4 x i16> @llvm.arm64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
-declare <8 x i16> @llvm.arm64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
-declare <2 x i32> @llvm.arm64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
-declare <4 x i32> @llvm.arm64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
 
 define <4 x i16> @test_sqrdmulh_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; CHECK-LABEL: test_sqrdmulh_v4i16:
-   %prod = call <4 x i16> @llvm.arm64.neon.sqrdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+   %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
 ; CHECK: sqrdmulh v0.4h, v0.4h, v1.4h
    ret <4 x i16> %prod
 }
 
 define <8 x i16> @test_sqrdmulh_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; CHECK-LABEL: test_sqrdmulh_v8i16:
-   %prod = call <8 x i16> @llvm.arm64.neon.sqrdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+   %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
 ; CHECK: sqrdmulh v0.8h, v0.8h, v1.8h
    ret <8 x i16> %prod
 }
 
 define <2 x i32> @test_sqrdmulh_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; CHECK-LABEL: test_sqrdmulh_v2i32:
-   %prod = call <2 x i32> @llvm.arm64.neon.sqrdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+   %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
 ; CHECK: sqrdmulh v0.2s, v0.2s, v1.2s
    ret <2 x i32> %prod
 }
 
 define <4 x i32> @test_sqrdmulh_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; CHECK-LABEL: test_sqrdmulh_v4i32:
-   %prod = call <4 x i32> @llvm.arm64.neon.sqrdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+   %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
 ; CHECK: sqrdmulh v0.4s, v0.4s, v1.4s
    ret <4 x i32> %prod
 }
 
-declare <2 x float> @llvm.arm64.neon.fmulx.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.arm64.neon.fmulx.v4f32(<4 x float>, <4 x float>)
-declare <2 x double> @llvm.arm64.neon.fmulx.v2f64(<2 x double>, <2 x double>)
+declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>)
 
 define <2 x float> @fmulx_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
 ; CHECK-LABEL: fmulx_v2f32:
 ; Using registers other than v0, v1 and v2 is possible, but would be odd.
 ; CHECK: fmulx v0.2s, v0.2s, v1.2s
-        %val = call <2 x float> @llvm.arm64.neon.fmulx.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+        %val = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %lhs, <2 x float> %rhs)
         ret <2 x float> %val
 }
 
@@ -783,7 +783,7 @@
 ; CHECK-LABEL: fmulx_v4f32:
 ; Using registers other than v0, v1 and v2 is possible, but would be odd.
 ; CHECK: fmulx v0.4s, v0.4s, v1.4s
-        %val = call <4 x float> @llvm.arm64.neon.fmulx.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+        %val = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %lhs, <4 x float> %rhs)
         ret <4 x float> %val
 }
 
@@ -791,7 +791,7 @@
 ; CHECK-LABEL: fmulx_v2f64:
 ; Using registers other than v0, v1 and v2 is possible, but would be odd.
 ; CHECK: fmulx v0.2d, v0.2d, v1.2d
-        %val = call <2 x double> @llvm.arm64.neon.fmulx.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+        %val = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %lhs, <2 x double> %rhs)
         ret <2 x double> %val
 }
 
diff --git a/llvm/test/CodeGen/ARM64/aarch64-neon-scalar-by-elem-mul.ll b/llvm/test/CodeGen/AArch64/arm64-neon-scalar-by-elem-mul.ll
similarity index 84%
rename from llvm/test/CodeGen/ARM64/aarch64-neon-scalar-by-elem-mul.ll
rename to llvm/test/CodeGen/AArch64/arm64-neon-scalar-by-elem-mul.ll
index 18e6e0f..92ed239 100644
--- a/llvm/test/CodeGen/ARM64/aarch64-neon-scalar-by-elem-mul.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-scalar-by-elem-mul.ll
@@ -61,13 +61,13 @@
   ret double %tmp2;
 }
 
-declare float @llvm.arm64.neon.fmulx.f32(float, float)
+declare float @llvm.aarch64.neon.fmulx.f32(float, float)
 
 define float @test_fmulx_lane_f32(float %a, <2 x float> %v) {
   ; CHECK-LABEL: test_fmulx_lane_f32
   ; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
   %tmp1 = extractelement <2 x float> %v, i32 1
-  %tmp2 = call float @llvm.arm64.neon.fmulx.f32(float %a, float %tmp1)
+  %tmp2 = call float @llvm.aarch64.neon.fmulx.f32(float %a, float %tmp1)
   ret float %tmp2;
 }
 
@@ -75,7 +75,7 @@
   ; CHECK-LABEL: test_fmulx_laneq_f32
   ; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
   %tmp1 = extractelement <4 x float> %v, i32 3
-  %tmp2 = call float @llvm.arm64.neon.fmulx.f32(float %a, float %tmp1)
+  %tmp2 = call float @llvm.aarch64.neon.fmulx.f32(float %a, float %tmp1)
   ret float %tmp2;
 }
 
@@ -83,17 +83,17 @@
   ; CHECK-LABEL: test_fmulx_laneq_f32_swap
   ; CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
   %tmp1 = extractelement <4 x float> %v, i32 3
-  %tmp2 = call float @llvm.arm64.neon.fmulx.f32(float %tmp1, float %a)
+  %tmp2 = call float @llvm.aarch64.neon.fmulx.f32(float %tmp1, float %a)
   ret float %tmp2;
 }
 
-declare double @llvm.arm64.neon.fmulx.f64(double, double)
+declare double @llvm.aarch64.neon.fmulx.f64(double, double)
 
 define double @test_fmulx_lane_f64(double %a, <1 x double> %v) {
   ; CHECK-LABEL: test_fmulx_lane_f64
   ; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0]|d[0-9]+}}
   %tmp1 = extractelement <1 x double> %v, i32 0
-  %tmp2 = call double @llvm.arm64.neon.fmulx.f64(double %a, double %tmp1)
+  %tmp2 = call double @llvm.aarch64.neon.fmulx.f64(double %a, double %tmp1)
   ret double %tmp2;
 }
 
@@ -101,7 +101,7 @@
   ; CHECK-LABEL: test_fmulx_laneq_f64_0
   ; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
   %tmp1 = extractelement <2 x double> %v, i32 0
-  %tmp2 = call double @llvm.arm64.neon.fmulx.f64(double %a, double %tmp1)
+  %tmp2 = call double @llvm.aarch64.neon.fmulx.f64(double %a, double %tmp1)
   ret double %tmp2;
 }
 
@@ -110,7 +110,7 @@
   ; CHECK-LABEL: test_fmulx_laneq_f64_1
   ; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
   %tmp1 = extractelement <2 x double> %v, i32 1
-  %tmp2 = call double @llvm.arm64.neon.fmulx.f64(double %a, double %tmp1)
+  %tmp2 = call double @llvm.aarch64.neon.fmulx.f64(double %a, double %tmp1)
   ret double %tmp2;
 }
 
@@ -118,7 +118,7 @@
   ; CHECK-LABEL: test_fmulx_laneq_f64_1_swap
   ; CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
   %tmp1 = extractelement <2 x double> %v, i32 1
-  %tmp2 = call double @llvm.arm64.neon.fmulx.f64(double %tmp1, double %a)
+  %tmp2 = call double @llvm.aarch64.neon.fmulx.f64(double %tmp1, double %a)
   ret double %tmp2;
 }
 
diff --git a/llvm/test/CodeGen/ARM64/aarch64-neon-select_cc.ll b/llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/aarch64-neon-select_cc.ll
rename to llvm/test/CodeGen/AArch64/arm64-neon-select_cc.ll
diff --git a/llvm/test/CodeGen/ARM64/aarch64-neon-simd-ldst-one.ll b/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/aarch64-neon-simd-ldst-one.ll
rename to llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
diff --git a/llvm/test/CodeGen/ARM64/aarch64-neon-simd-shift.ll b/llvm/test/CodeGen/AArch64/arm64-neon-simd-shift.ll
similarity index 81%
rename from llvm/test/CodeGen/ARM64/aarch64-neon-simd-shift.ll
rename to llvm/test/CodeGen/AArch64/arm64-neon-simd-shift.ll
index 2fd2c1e..447fb63 100644
--- a/llvm/test/CodeGen/ARM64/aarch64-neon-simd-shift.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-simd-shift.ll
@@ -333,7 +333,7 @@
 define <16 x i8> @test_vqshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
 ; CHECK: test_vqshrun_high_n_s16
 ; CHECK: sqshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
-  %vqshrun = tail call <8 x i8> @llvm.arm64.neon.sqshrun.v8i8(<8 x i16> %b, i32 3)
+  %vqshrun = tail call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> %b, i32 3)
   %1 = bitcast <8 x i8> %a to <1 x i64>
   %2 = bitcast <8 x i8> %vqshrun to <1 x i64>
   %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
@@ -344,7 +344,7 @@
 define <8 x i16> @test_vqshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
 ; CHECK: test_vqshrun_high_n_s32
 ; CHECK: sqshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
-  %vqshrun = tail call <4 x i16> @llvm.arm64.neon.sqshrun.v4i16(<4 x i32> %b, i32 9)
+  %vqshrun = tail call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> %b, i32 9)
   %1 = bitcast <4 x i16> %a to <1 x i64>
   %2 = bitcast <4 x i16> %vqshrun to <1 x i64>
   %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
@@ -356,7 +356,7 @@
 ; CHECK: test_vqshrun_high_n_s64
 ; CHECK: sqshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
   %1 = bitcast <2 x i32> %a to <1 x i64>
-  %vqshrun = tail call <2 x i32> @llvm.arm64.neon.sqshrun.v2i32(<2 x i64> %b, i32 19)
+  %vqshrun = tail call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> %b, i32 19)
   %2 = bitcast <2 x i32> %vqshrun to <1 x i64>
   %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
   %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
@@ -366,7 +366,7 @@
 define <16 x i8> @test_vrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
 ; CHECK: test_vrshrn_high_n_s16
 ; CHECK: rshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
-  %vrshrn = tail call <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16> %b, i32 3)
+  %vrshrn = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %b, i32 3)
   %1 = bitcast <8 x i8> %a to <1 x i64>
   %2 = bitcast <8 x i8> %vrshrn to <1 x i64>
   %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
@@ -377,7 +377,7 @@
 define <8 x i16> @test_vrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
 ; CHECK: test_vrshrn_high_n_s32
 ; CHECK: rshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
-  %vrshrn = tail call <4 x i16> @llvm.arm64.neon.rshrn.v4i16(<4 x i32> %b, i32 9)
+  %vrshrn = tail call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %b, i32 9)
   %1 = bitcast <4 x i16> %a to <1 x i64>
   %2 = bitcast <4 x i16> %vrshrn to <1 x i64>
   %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
@@ -389,7 +389,7 @@
 ; CHECK: test_vrshrn_high_n_s64
 ; CHECK: rshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
   %1 = bitcast <2 x i32> %a to <1 x i64>
-  %vrshrn = tail call <2 x i32> @llvm.arm64.neon.rshrn.v2i32(<2 x i64> %b, i32 19)
+  %vrshrn = tail call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> %b, i32 19)
   %2 = bitcast <2 x i32> %vrshrn to <1 x i64>
   %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
   %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
@@ -399,7 +399,7 @@
 define <16 x i8> @test_vqrshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
 ; CHECK: test_vqrshrun_high_n_s16
 ; CHECK: sqrshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
-  %vqrshrun = tail call <8 x i8> @llvm.arm64.neon.sqrshrun.v8i8(<8 x i16> %b, i32 3)
+  %vqrshrun = tail call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> %b, i32 3)
   %1 = bitcast <8 x i8> %a to <1 x i64>
   %2 = bitcast <8 x i8> %vqrshrun to <1 x i64>
   %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
@@ -410,7 +410,7 @@
 define <8 x i16> @test_vqrshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
 ; CHECK: test_vqrshrun_high_n_s32
 ; CHECK: sqrshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
-  %vqrshrun = tail call <4 x i16> @llvm.arm64.neon.sqrshrun.v4i16(<4 x i32> %b, i32 9)
+  %vqrshrun = tail call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> %b, i32 9)
   %1 = bitcast <4 x i16> %a to <1 x i64>
   %2 = bitcast <4 x i16> %vqrshrun to <1 x i64>
   %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
@@ -422,7 +422,7 @@
 ; CHECK: test_vqrshrun_high_n_s64
 ; CHECK: sqrshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
   %1 = bitcast <2 x i32> %a to <1 x i64>
-  %vqrshrun = tail call <2 x i32> @llvm.arm64.neon.sqrshrun.v2i32(<2 x i64> %b, i32 19)
+  %vqrshrun = tail call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> %b, i32 19)
   %2 = bitcast <2 x i32> %vqrshrun to <1 x i64>
   %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
   %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
@@ -432,7 +432,7 @@
 define <16 x i8> @test_vqshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
 ; CHECK: test_vqshrn_high_n_s16
 ; CHECK: sqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
-  %vqshrn = tail call <8 x i8> @llvm.arm64.neon.sqshrn.v8i8(<8 x i16> %b, i32 3)
+  %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> %b, i32 3)
   %1 = bitcast <8 x i8> %a to <1 x i64>
   %2 = bitcast <8 x i8> %vqshrn to <1 x i64>
   %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
@@ -443,7 +443,7 @@
 define <8 x i16> @test_vqshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
 ; CHECK: test_vqshrn_high_n_s32
 ; CHECK: sqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
-  %vqshrn = tail call <4 x i16> @llvm.arm64.neon.sqshrn.v4i16(<4 x i32> %b, i32 9)
+  %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> %b, i32 9)
   %1 = bitcast <4 x i16> %a to <1 x i64>
   %2 = bitcast <4 x i16> %vqshrn to <1 x i64>
   %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
@@ -455,7 +455,7 @@
 ; CHECK: test_vqshrn_high_n_s64
 ; CHECK: sqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
   %1 = bitcast <2 x i32> %a to <1 x i64>
-  %vqshrn = tail call <2 x i32> @llvm.arm64.neon.sqshrn.v2i32(<2 x i64> %b, i32 19)
+  %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> %b, i32 19)
   %2 = bitcast <2 x i32> %vqshrn to <1 x i64>
   %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
   %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
@@ -465,7 +465,7 @@
 define <16 x i8> @test_vqshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) {
 ; CHECK: test_vqshrn_high_n_u16
 ; CHECK: uqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
-  %vqshrn = tail call <8 x i8> @llvm.arm64.neon.uqshrn.v8i8(<8 x i16> %b, i32 3)
+  %vqshrn = tail call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> %b, i32 3)
   %1 = bitcast <8 x i8> %a to <1 x i64>
   %2 = bitcast <8 x i8> %vqshrn to <1 x i64>
   %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
@@ -476,7 +476,7 @@
 define <8 x i16> @test_vqshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) {
 ; CHECK: test_vqshrn_high_n_u32
 ; CHECK: uqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
-  %vqshrn = tail call <4 x i16> @llvm.arm64.neon.uqshrn.v4i16(<4 x i32> %b, i32 9)
+  %vqshrn = tail call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> %b, i32 9)
   %1 = bitcast <4 x i16> %a to <1 x i64>
   %2 = bitcast <4 x i16> %vqshrn to <1 x i64>
   %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
@@ -488,7 +488,7 @@
 ; CHECK: test_vqshrn_high_n_u64
 ; CHECK: uqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
   %1 = bitcast <2 x i32> %a to <1 x i64>
-  %vqshrn = tail call <2 x i32> @llvm.arm64.neon.uqshrn.v2i32(<2 x i64> %b, i32 19)
+  %vqshrn = tail call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> %b, i32 19)
   %2 = bitcast <2 x i32> %vqshrn to <1 x i64>
   %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
   %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
@@ -498,7 +498,7 @@
 define <16 x i8> @test_vqrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) {
 ; CHECK: test_vqrshrn_high_n_s16
 ; CHECK: sqrshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
-  %vqrshrn = tail call <8 x i8> @llvm.arm64.neon.sqrshrn.v8i8(<8 x i16> %b, i32 3)
+  %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> %b, i32 3)
   %1 = bitcast <8 x i8> %a to <1 x i64>
   %2 = bitcast <8 x i8> %vqrshrn to <1 x i64>
   %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
@@ -509,7 +509,7 @@
 define <8 x i16> @test_vqrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) {
 ; CHECK: test_vqrshrn_high_n_s32
 ; CHECK: sqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
-  %vqrshrn = tail call <4 x i16> @llvm.arm64.neon.sqrshrn.v4i16(<4 x i32> %b, i32 9)
+  %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> %b, i32 9)
   %1 = bitcast <4 x i16> %a to <1 x i64>
   %2 = bitcast <4 x i16> %vqrshrn to <1 x i64>
   %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
@@ -521,7 +521,7 @@
 ; CHECK: test_vqrshrn_high_n_s64
 ; CHECK: sqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
   %1 = bitcast <2 x i32> %a to <1 x i64>
-  %vqrshrn = tail call <2 x i32> @llvm.arm64.neon.sqrshrn.v2i32(<2 x i64> %b, i32 19)
+  %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> %b, i32 19)
   %2 = bitcast <2 x i32> %vqrshrn to <1 x i64>
   %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
   %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
@@ -531,7 +531,7 @@
 define <16 x i8> @test_vqrshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) {
 ; CHECK: test_vqrshrn_high_n_u16
 ; CHECK: uqrshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
-  %vqrshrn = tail call <8 x i8> @llvm.arm64.neon.uqrshrn.v8i8(<8 x i16> %b, i32 3)
+  %vqrshrn = tail call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> %b, i32 3)
   %1 = bitcast <8 x i8> %a to <1 x i64>
   %2 = bitcast <8 x i8> %vqrshrn to <1 x i64>
   %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
@@ -542,7 +542,7 @@
 define <8 x i16> @test_vqrshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) {
 ; CHECK: test_vqrshrn_high_n_u32
 ; CHECK: uqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
-  %vqrshrn = tail call <4 x i16> @llvm.arm64.neon.uqrshrn.v4i16(<4 x i32> %b, i32 9)
+  %vqrshrn = tail call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> %b, i32 9)
   %1 = bitcast <4 x i16> %a to <1 x i64>
   %2 = bitcast <4 x i16> %vqrshrn to <1 x i64>
   %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
@@ -554,7 +554,7 @@
 ; CHECK: test_vqrshrn_high_n_u64
 ; CHECK: uqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
   %1 = bitcast <2 x i32> %a to <1 x i64>
-  %vqrshrn = tail call <2 x i32> @llvm.arm64.neon.uqrshrn.v2i32(<2 x i64> %b, i32 19)
+  %vqrshrn = tail call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> %b, i32 19)
   %2 = bitcast <2 x i32> %vqrshrn to <1 x i64>
   %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
   %3 = bitcast <2 x i64> %shuffle.i to <4 x i32>
@@ -563,101 +563,101 @@
 
 
 
-declare <8 x i8> @llvm.arm64.neon.sqshrun.v8i8(<8 x i16>, i32)
+declare <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16>, i32)
 
-declare <4 x i16> @llvm.arm64.neon.sqshrun.v4i16(<4 x i32>, i32)
+declare <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32>, i32)
 
-declare <2 x i32> @llvm.arm64.neon.sqshrun.v2i32(<2 x i64>, i32)
+declare <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64>, i32)
 
-declare <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16>, i32)
+declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32)
 
-declare <4 x i16> @llvm.arm64.neon.rshrn.v4i16(<4 x i32>, i32)
+declare <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32>, i32)
 
-declare <2 x i32> @llvm.arm64.neon.rshrn.v2i32(<2 x i64>, i32)
+declare <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64>, i32)
 
-declare <8 x i8> @llvm.arm64.neon.sqrshrun.v8i8(<8 x i16>, i32)
+declare <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16>, i32)
 
-declare <4 x i16> @llvm.arm64.neon.sqrshrun.v4i16(<4 x i32>, i32)
+declare <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32>, i32)
 
-declare <2 x i32> @llvm.arm64.neon.sqrshrun.v2i32(<2 x i64>, i32)
+declare <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64>, i32)
 
-declare <8 x i8> @llvm.arm64.neon.sqshrn.v8i8(<8 x i16>, i32)
+declare <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16>, i32)
 
-declare <4 x i16> @llvm.arm64.neon.sqshrn.v4i16(<4 x i32>, i32)
+declare <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32>, i32)
 
-declare <2 x i32> @llvm.arm64.neon.sqshrn.v2i32(<2 x i64>, i32)
+declare <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64>, i32)
 
-declare <8 x i8> @llvm.arm64.neon.uqshrn.v8i8(<8 x i16>, i32)
+declare <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16>, i32)
 
-declare <4 x i16> @llvm.arm64.neon.uqshrn.v4i16(<4 x i32>, i32)
+declare <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32>, i32)
 
-declare <2 x i32> @llvm.arm64.neon.uqshrn.v2i32(<2 x i64>, i32)
+declare <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64>, i32)
 
-declare <8 x i8> @llvm.arm64.neon.sqrshrn.v8i8(<8 x i16>, i32)
+declare <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16>, i32)
 
-declare <4 x i16> @llvm.arm64.neon.sqrshrn.v4i16(<4 x i32>, i32)
+declare <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32>, i32)
 
-declare <2 x i32> @llvm.arm64.neon.sqrshrn.v2i32(<2 x i64>, i32)
+declare <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64>, i32)
 
-declare <8 x i8> @llvm.arm64.neon.uqrshrn.v8i8(<8 x i16>, i32)
+declare <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16>, i32)
 
-declare <4 x i16> @llvm.arm64.neon.uqrshrn.v4i16(<4 x i32>, i32)
+declare <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32>, i32)
 
-declare <2 x i32> @llvm.arm64.neon.uqrshrn.v2i32(<2 x i64>, i32)
+declare <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64>, i32)
 
-declare <2 x float> @llvm.arm64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32)
+declare <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32)
 
-declare <4 x float> @llvm.arm64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32)
+declare <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32)
 
-declare <2 x double> @llvm.arm64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32)
+declare <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32)
 
-declare <2 x float> @llvm.arm64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32)
+declare <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32)
 
-declare <4 x float> @llvm.arm64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32)
+declare <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32)
 
-declare <2 x double> @llvm.arm64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32)
+declare <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32)
 
-declare <2 x i32> @llvm.arm64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32)
+declare <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32)
 
-declare <4 x i32> @llvm.arm64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32)
+declare <4 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32)
 
-declare <2 x i64> @llvm.arm64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double>, i32)
+declare <2 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double>, i32)
 
-declare <2 x i32> @llvm.arm64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32)
+declare <2 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32)
 
-declare <4 x i32> @llvm.arm64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32)
+declare <4 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32)
 
-declare <2 x i64> @llvm.arm64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double>, i32)
+declare <2 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double>, i32)
 
 define <1 x i64> @test_vcvt_n_s64_f64(<1 x double> %a) {
 ; CHECK-LABEL: test_vcvt_n_s64_f64
 ; CHECK: fcvtzs d{{[0-9]+}}, d{{[0-9]+}}, #64
-  %1 = tail call <1 x i64> @llvm.arm64.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double> %a, i32 64)
+  %1 = tail call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double> %a, i32 64)
   ret <1 x i64> %1
 }
 
 define <1 x i64> @test_vcvt_n_u64_f64(<1 x double> %a) {
 ; CHECK-LABEL: test_vcvt_n_u64_f64
 ; CHECK: fcvtzu d{{[0-9]+}}, d{{[0-9]+}}, #64
-  %1 = tail call <1 x i64> @llvm.arm64.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double> %a, i32 64)
+  %1 = tail call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double> %a, i32 64)
   ret <1 x i64> %1
 }
 
 define <1 x double> @test_vcvt_n_f64_s64(<1 x i64> %a) {
 ; CHECK-LABEL: test_vcvt_n_f64_s64
 ; CHECK: scvtf d{{[0-9]+}}, d{{[0-9]+}}, #64
-  %1 = tail call <1 x double> @llvm.arm64.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64> %a, i32 64)
+  %1 = tail call <1 x double> @llvm.aarch64.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64> %a, i32 64)
   ret <1 x double> %1
 }
 
 define <1 x double> @test_vcvt_n_f64_u64(<1 x i64> %a) {
 ; CHECK-LABEL: test_vcvt_n_f64_u64
 ; CHECK: ucvtf d{{[0-9]+}}, d{{[0-9]+}}, #64
-  %1 = tail call <1 x double> @llvm.arm64.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64> %a, i32 64)
+  %1 = tail call <1 x double> @llvm.aarch64.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64> %a, i32 64)
   ret <1 x double> %1
 }
 
-declare <1 x i64> @llvm.arm64.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double>, i32)
-declare <1 x i64> @llvm.arm64.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double>, i32)
-declare <1 x double> @llvm.arm64.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64>, i32)
-declare <1 x double> @llvm.arm64.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64>, i32)
+declare <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double>, i32)
+declare <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double>, i32)
+declare <1 x double> @llvm.aarch64.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64>, i32)
+declare <1 x double> @llvm.aarch64.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64>, i32)
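
In the vcvtfxs2fp/vcvtfxu2fp declarations above, the trailing i32 operand is
the number of fractional bits for the fixed-point conversion. A standalone
sketch under the new prefix (the function name is illustrative, not from this
patch):

    declare <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32)

    define <2 x float> @scvtf_n8(<2 x i32> %v) {
      ; signed fixed-point to float with 8 fractional bits: scvtf v0.2s, v0.2s, #8
      %r = call <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %v, i32 8)
      ret <2 x float> %r
    }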
diff --git a/llvm/test/CodeGen/ARM64/aarch64-neon-simd-vget.ll b/llvm/test/CodeGen/AArch64/arm64-neon-simd-vget.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/aarch64-neon-simd-vget.ll
rename to llvm/test/CodeGen/AArch64/arm64-neon-simd-vget.ll
diff --git a/llvm/test/CodeGen/ARM64/neon-v1i1-setcc.ll b/llvm/test/CodeGen/AArch64/arm64-neon-v1i1-setcc.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/neon-v1i1-setcc.ll
rename to llvm/test/CodeGen/AArch64/arm64-neon-v1i1-setcc.ll
diff --git a/llvm/test/CodeGen/ARM64/aarch64-neon-vector-list-spill.ll b/llvm/test/CodeGen/AArch64/arm64-neon-vector-list-spill.ll
similarity index 77%
rename from llvm/test/CodeGen/ARM64/aarch64-neon-vector-list-spill.ll
rename to llvm/test/CodeGen/AArch64/arm64-neon-vector-list-spill.ll
index 9e69ac0..8262fe4 100644
--- a/llvm/test/CodeGen/ARM64/aarch64-neon-vector-list-spill.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-vector-list-spill.ll
@@ -10,7 +10,7 @@
 ; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
 ; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
 entry:
-  %vld = tail call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2.v2i32.p0i32(i32* %arg1)
+  %vld = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0i32(i32* %arg1)
   %cmp = icmp eq i32 %arg2, 0
   br i1 %cmp, label %if.then, label %if.end
 
@@ -30,7 +30,7 @@
 ; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
 ; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
 entry:
-  %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3.v4i16.p0i16(i16* %arg1)
+  %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0i16(i16* %arg1)
   %cmp = icmp eq i32 %arg2, 0
   br i1 %cmp, label %if.then, label %if.end
 
@@ -50,7 +50,7 @@
 ; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
 ; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
 entry:
-  %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4.v4i16.p0i16(i16* %arg1)
+  %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0i16(i16* %arg1)
   %cmp = icmp eq i32 %arg2, 0
   br i1 %cmp, label %if.then, label %if.end
 
@@ -70,7 +70,7 @@
 ; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
 ; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
 entry:
-  %vld = tail call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2.v4i32.p0i32(i32* %arg1)
+  %vld = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i32(i32* %arg1)
   %cmp = icmp eq i32 %arg2, 0
   br i1 %cmp, label %if.then, label %if.end
 
@@ -90,7 +90,7 @@
 ; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
 ; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
 entry:
-  %vld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3.v4f32.p0f32(float* %arg1)
+  %vld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0f32(float* %arg1)
   %cmp = icmp eq i32 %arg2, 0
   br i1 %cmp, label %if.then, label %if.end
 
@@ -110,7 +110,7 @@
 ; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
 ; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
 entry:
-  %vld = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0i8(i8* %arg1)
+  %vld = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0i8(i8* %arg1)
   %cmp = icmp eq i32 %arg2, 0
   br i1 %cmp, label %if.then, label %if.end
 
@@ -124,12 +124,12 @@
   ret i8 %res
 }
 
-declare { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2.v2i32.p0i32(i32*)
-declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3.v4i16.p0i16(i16*)
-declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4.v4i16.p0i16(i16*)
-declare { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2.v4i32.p0i32(i32*)
-declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3.v4f32.p0f32(float*)
-declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0i8(i8*)
+declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0i32(i32*)
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0i16(i16*)
+declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0i16(i16*)
+declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i32(i32*)
+declare { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0f32(float*)
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0i8(i8*)
 
 declare void @foo()
 
@@ -139,7 +139,7 @@
 ; then we can delete it.
 ; check the spill for Register Class QPair_with_qsub_0_in_FPR128Lo
 define <8 x i16> @test_2xFPR128Lo(i64 %got, i64* %ptr, <1 x i64> %a) {
-  tail call void @llvm.arm64.neon.st2lane.v1i64.p0i64(<1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i64 0, i64* %ptr)
+  tail call void @llvm.aarch64.neon.st2lane.v1i64.p0i64(<1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i64 0, i64* %ptr)
   tail call void @foo()
   %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> <i32 0, i32 1>
   %1 = bitcast <2 x i64> %sv to <8 x i16>
@@ -150,7 +150,7 @@
 
 ; check the spill for Register Class QTriple_with_qsub_0_in_FPR128Lo
 define <8 x i16> @test_3xFPR128Lo(i64 %got, i64* %ptr, <1 x i64> %a) {
-  tail call void @llvm.arm64.neon.st3lane.v1i64.p0i64(<1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i64 0, i64* %ptr)
+  tail call void @llvm.aarch64.neon.st3lane.v1i64.p0i64(<1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i64 0, i64* %ptr)
   tail call void @foo()
   %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> <i32 0, i32 1>
   %1 = bitcast <2 x i64> %sv to <8 x i16>
@@ -161,7 +161,7 @@
 
 ; check the spill for Register Class QQuad_with_qsub_0_in_FPR128Lo
 define <8 x i16> @test_4xFPR128Lo(i64 %got, i64* %ptr, <1 x i64> %a) {
-  tail call void @llvm.arm64.neon.st4lane.v1i64.p0i64(<1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i64 0, i64* %ptr)
+  tail call void @llvm.aarch64.neon.st4lane.v1i64.p0i64(<1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i64 0, i64* %ptr)
   tail call void @foo()
   %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> <i32 0, i32 1>
   %1 = bitcast <2 x i64> %sv to <8 x i16>
@@ -170,6 +170,6 @@
   ret <8 x i16> %3
 }
 
-declare void @llvm.arm64.neon.st2lane.v1i64.p0i64(<1 x i64>, <1 x i64>, i64, i64*)
-declare void @llvm.arm64.neon.st3lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64, i64*)
-declare void @llvm.arm64.neon.st4lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64, i64*)
+declare void @llvm.aarch64.neon.st2lane.v1i64.p0i64(<1 x i64>, <1 x i64>, i64, i64*)
+declare void @llvm.aarch64.neon.st3lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64, i64*)
+declare void @llvm.aarch64.neon.st4lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64, i64*)
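
The renamed ld2/ld3/ld4 intrinsics return their de-interleaved vectors as a
literal struct value, which is then taken apart with extractvalue. A minimal
sketch of the ld2 form (names illustrative, not from this patch):

    declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0i32(i32*)

    define <2 x i32> @ld2_first(i32* %p) {
      ; load two interleaved <2 x i32> vectors and return the first
      %pair = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0i32(i32* %p)
      %lo = extractvalue { <2 x i32>, <2 x i32> } %pair, 0
      ret <2 x i32> %lo
    }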
diff --git a/llvm/test/CodeGen/ARM64/patchpoint.ll b/llvm/test/CodeGen/AArch64/arm64-patchpoint.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/patchpoint.ll
rename to llvm/test/CodeGen/AArch64/arm64-patchpoint.ll
diff --git a/llvm/test/CodeGen/ARM64/pic-local-symbol.ll b/llvm/test/CodeGen/AArch64/arm64-pic-local-symbol.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/pic-local-symbol.ll
rename to llvm/test/CodeGen/AArch64/arm64-pic-local-symbol.ll
diff --git a/llvm/test/CodeGen/ARM64/platform-reg.ll b/llvm/test/CodeGen/AArch64/arm64-platform-reg.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/platform-reg.ll
rename to llvm/test/CodeGen/AArch64/arm64-platform-reg.ll
diff --git a/llvm/test/CodeGen/ARM64/popcnt.ll b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
similarity index 92%
rename from llvm/test/CodeGen/ARM64/popcnt.ll
rename to llvm/test/CodeGen/AArch64/arm64-popcnt.ll
index 9bbba09..2afade2 100644
--- a/llvm/test/CodeGen/ARM64/popcnt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-popcnt.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 define i32 @cnt32_advsimd(i32 %x) nounwind readnone {
   %cnt = tail call i32 @llvm.ctpop.i32(i32 %x)
diff --git a/llvm/test/CodeGen/ARM64/prefetch.ll b/llvm/test/CodeGen/AArch64/arm64-prefetch.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/prefetch.ll
rename to llvm/test/CodeGen/AArch64/arm64-prefetch.ll
diff --git a/llvm/test/CodeGen/ARM64/promote-const.ll b/llvm/test/CodeGen/AArch64/arm64-promote-const.ll
similarity index 98%
rename from llvm/test/CodeGen/ARM64/promote-const.ll
rename to llvm/test/CodeGen/AArch64/arm64-promote-const.ll
index 9e7a215..380ff55 100644
--- a/llvm/test/CodeGen/ARM64/promote-const.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-promote-const.ll
@@ -1,9 +1,9 @@
 ; Disable machine cse to stress the different path of the algorithm.
 ; Otherwise, we always fall in the simple case, i.e., only one definition.
-; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -disable-machine-cse -arm64-stress-promote-const -mcpu=cyclone | FileCheck -check-prefix=PROMOTED %s
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -disable-machine-cse -aarch64-stress-promote-const -mcpu=cyclone | FileCheck -check-prefix=PROMOTED %s
 ; The REGULAR run just checks that the inputs passed to promote const expose
 ; the appropriate patterns.
-; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -disable-machine-cse -arm64-promote-const=false -mcpu=cyclone | FileCheck -check-prefix=REGULAR %s
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -disable-machine-cse -aarch64-promote-const=false -mcpu=cyclone | FileCheck -check-prefix=REGULAR %s
 
 %struct.uint8x16x4_t = type { [4 x <16 x i8>] }
 
diff --git a/llvm/test/CodeGen/ARM64/redzone.ll b/llvm/test/CodeGen/AArch64/arm64-redzone.ll
similarity index 87%
rename from llvm/test/CodeGen/ARM64/redzone.ll
rename to llvm/test/CodeGen/AArch64/arm64-redzone.ll
index b89d7b1..9b0c384 100644
--- a/llvm/test/CodeGen/ARM64/redzone.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-redzone.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-redzone | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-redzone | FileCheck %s
 
 define i32 @foo(i32 %a, i32 %b) nounwind ssp {
 ; CHECK-LABEL: foo:
diff --git a/llvm/test/CodeGen/ARM64/reg-copy-noneon.ll b/llvm/test/CodeGen/AArch64/arm64-reg-copy-noneon.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/reg-copy-noneon.ll
rename to llvm/test/CodeGen/AArch64/arm64-reg-copy-noneon.ll
diff --git a/llvm/test/CodeGen/ARM64/register-offset-addressing.ll b/llvm/test/CodeGen/AArch64/arm64-register-offset-addressing.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/register-offset-addressing.ll
rename to llvm/test/CodeGen/AArch64/arm64-register-offset-addressing.ll
diff --git a/llvm/test/CodeGen/ARM64/register-pairing.ll b/llvm/test/CodeGen/AArch64/arm64-register-pairing.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/register-pairing.ll
rename to llvm/test/CodeGen/AArch64/arm64-register-pairing.ll
diff --git a/llvm/test/CodeGen/ARM64/regress-f128csel-flags.ll b/llvm/test/CodeGen/AArch64/arm64-regress-f128csel-flags.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/regress-f128csel-flags.ll
rename to llvm/test/CodeGen/AArch64/arm64-regress-f128csel-flags.ll
diff --git a/llvm/test/CodeGen/ARM64/regress-interphase-shift.ll b/llvm/test/CodeGen/AArch64/arm64-regress-interphase-shift.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/regress-interphase-shift.ll
rename to llvm/test/CodeGen/AArch64/arm64-regress-interphase-shift.ll
diff --git a/llvm/test/CodeGen/ARM64/return-vector.ll b/llvm/test/CodeGen/AArch64/arm64-return-vector.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/return-vector.ll
rename to llvm/test/CodeGen/AArch64/arm64-return-vector.ll
diff --git a/llvm/test/CodeGen/ARM64/returnaddr.ll b/llvm/test/CodeGen/AArch64/arm64-returnaddr.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/returnaddr.ll
rename to llvm/test/CodeGen/AArch64/arm64-returnaddr.ll
diff --git a/llvm/test/CodeGen/ARM64/rev.ll b/llvm/test/CodeGen/AArch64/arm64-rev.ll
similarity index 98%
rename from llvm/test/CodeGen/ARM64/rev.ll
rename to llvm/test/CodeGen/AArch64/arm64-rev.ll
index 1da59e4..30d9f4f 100644
--- a/llvm/test/CodeGen/ARM64/rev.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-rev.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 define i32 @test_rev_w(i32 %a) nounwind {
 entry:
diff --git a/llvm/test/CodeGen/ARM64/rounding.ll b/llvm/test/CodeGen/AArch64/arm64-rounding.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/rounding.ll
rename to llvm/test/CodeGen/AArch64/arm64-rounding.ll
diff --git a/llvm/test/CodeGen/ARM64/scaled_iv.ll b/llvm/test/CodeGen/AArch64/arm64-scaled_iv.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/scaled_iv.ll
rename to llvm/test/CodeGen/AArch64/arm64-scaled_iv.ll
diff --git a/llvm/test/CodeGen/ARM64/scvt.ll b/llvm/test/CodeGen/AArch64/arm64-scvt.ll
similarity index 99%
rename from llvm/test/CodeGen/ARM64/scvt.ll
rename to llvm/test/CodeGen/AArch64/arm64-scvt.ll
index b4d4add..2e006cf 100644
--- a/llvm/test/CodeGen/ARM64/scvt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-scvt.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 ; rdar://13082402
 
 define float @t1(i32* nocapture %src) nounwind ssp {
diff --git a/llvm/test/CodeGen/ARM64/shifted-sext.ll b/llvm/test/CodeGen/AArch64/arm64-shifted-sext.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/shifted-sext.ll
rename to llvm/test/CodeGen/AArch64/arm64-shifted-sext.ll
diff --git a/llvm/test/CodeGen/AArch64/arm64-simd-scalar-to-vector.ll b/llvm/test/CodeGen/AArch64/arm64-simd-scalar-to-vector.ll
new file mode 100644
index 0000000..aed39e7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-simd-scalar-to-vector.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -mcpu=cyclone | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -O0 -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST
+
+define <16 x i8> @foo(<16 x i8> %a) nounwind optsize readnone ssp {
+; CHECK: uaddlv.16b h0, v0
+; CHECK: rshrn.8b v0, v0, #4
+; CHECK: dup.16b v0, v0[0]
+; CHECK: ret
+
+; CHECK-FAST: uaddlv.16b
+; CHECK-FAST: rshrn.8b
+; CHECK-FAST: dup.16b
+  %tmp = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> %a) nounwind
+  %tmp1 = trunc i32 %tmp to i16
+  %tmp2 = insertelement <8 x i16> undef, i16 %tmp1, i32 0
+  %tmp3 = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %tmp2, i32 4)
+  %tmp4 = shufflevector <8 x i8> %tmp3, <8 x i8> undef, <16 x i32> zeroinitializer
+  ret <16 x i8> %tmp4
+}
+
+declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8>) nounwind readnone
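
Backend command-line options are renamed along with the classes, so RUN lines
here keep an arm64 -march or triple while flags such as -aarch64-neon-syntax
take the new prefix. A sketch in the same style, assuming nothing beyond the
renamed flag (the function name is illustrative):

    ; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s

    define <8 x i8> @cnt_example(<8 x i8> %v) {
    ; CHECK-LABEL: cnt_example
    ; CHECK: cnt.8b
      %r = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %v)
      ret <8 x i8> %r
    }

    declare <8 x i8> @llvm.ctpop.v8i8(<8 x i8>)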
diff --git a/llvm/test/CodeGen/ARM64/simplest-elf.ll b/llvm/test/CodeGen/AArch64/arm64-simplest-elf.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/simplest-elf.ll
rename to llvm/test/CodeGen/AArch64/arm64-simplest-elf.ll
diff --git a/llvm/test/CodeGen/ARM64/sincos.ll b/llvm/test/CodeGen/AArch64/arm64-sincos.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/sincos.ll
rename to llvm/test/CodeGen/AArch64/arm64-sincos.ll
diff --git a/llvm/test/CodeGen/ARM64/sitofp-combine-chains.ll b/llvm/test/CodeGen/AArch64/arm64-sitofp-combine-chains.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/sitofp-combine-chains.ll
rename to llvm/test/CodeGen/AArch64/arm64-sitofp-combine-chains.ll
diff --git a/llvm/test/CodeGen/ARM64/sli-sri-opt.ll b/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll
similarity index 94%
rename from llvm/test/CodeGen/ARM64/sli-sri-opt.ll
rename to llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll
index 725dcd5..7fec539 100644
--- a/llvm/test/CodeGen/ARM64/sli-sri-opt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll
@@ -1,4 +1,4 @@
-; RUN: llc -arm64-shift-insert-generation=true -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+; RUN: llc -aarch64-shift-insert-generation=true -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s
 
 define void @testLeftGood(<16 x i8> %src1, <16 x i8> %src2, <16 x i8>* %dest) nounwind {
 ; CHECK-LABEL: testLeftGood:
diff --git a/llvm/test/CodeGen/ARM64/smaxv.ll b/llvm/test/CodeGen/AArch64/arm64-smaxv.ll
similarity index 61%
rename from llvm/test/CodeGen/ARM64/smaxv.ll
rename to llvm/test/CodeGen/AArch64/arm64-smaxv.ll
index 4f6e01b..183e667 100644
--- a/llvm/test/CodeGen/ARM64/smaxv.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-smaxv.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s
 
 define signext i8 @test_vmaxv_s8(<8 x i8> %a1) {
 ; CHECK: test_vmaxv_s8
@@ -6,7 +6,7 @@
 ; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
 ; CHECK-NEXT: ret
 entry:
-  %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v8i8(<8 x i8> %a1)
+  %vmaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8> %a1)
   %0 = trunc i32 %vmaxv.i to i8
   ret i8 %0
 }
@@ -17,7 +17,7 @@
 ; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
 ; CHECK-NEXT: ret
 entry:
-  %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v4i16(<4 x i16> %a1)
+  %vmaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16> %a1)
   %0 = trunc i32 %vmaxv.i to i16
   ret i16 %0
 }
@@ -29,7 +29,7 @@
 ; CHECK-NEXT: fmov w0, s[[REGNUM]]
 ; CHECK-NEXT: ret
 entry:
-  %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v2i32(<2 x i32> %a1)
+  %vmaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v2i32(<2 x i32> %a1)
   ret i32 %vmaxv.i
 }
 
@@ -39,7 +39,7 @@
 ; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
 ; CHECK-NEXT: ret
 entry:
-  %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v16i8(<16 x i8> %a1)
+  %vmaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8> %a1)
   %0 = trunc i32 %vmaxv.i to i8
   ret i8 %0
 }
@@ -50,7 +50,7 @@
 ; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
 ; CHECK-NEXT: ret
 entry:
-  %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v8i16(<8 x i16> %a1)
+  %vmaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16> %a1)
   %0 = trunc i32 %vmaxv.i to i16
   ret i16 %0
 }
@@ -61,14 +61,14 @@
 ; CHECK-NEXT: fmov w0, [[REGNUM]]
 ; CHECK-NEXT: ret
 entry:
-  %vmaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v4i32(<4 x i32> %a1)
+  %vmaxv.i = tail call i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32> %a1)
   ret i32 %vmaxv.i
 }
 
-declare i32 @llvm.arm64.neon.smaxv.i32.v4i32(<4 x i32>)
-declare i32 @llvm.arm64.neon.smaxv.i32.v8i16(<8 x i16>)
-declare i32 @llvm.arm64.neon.smaxv.i32.v16i8(<16 x i8>)
-declare i32 @llvm.arm64.neon.smaxv.i32.v2i32(<2 x i32>)
-declare i32 @llvm.arm64.neon.smaxv.i32.v4i16(<4 x i16>)
-declare i32 @llvm.arm64.neon.smaxv.i32.v8i8(<8 x i8>)
+declare i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32>)
+declare i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16>)
+declare i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8>)
+declare i32 @llvm.aarch64.neon.smaxv.i32.v2i32(<2 x i32>)
+declare i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16>)
+declare i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8>)
 
diff --git a/llvm/test/CodeGen/ARM64/sminv.ll b/llvm/test/CodeGen/AArch64/arm64-sminv.ll
similarity index 61%
rename from llvm/test/CodeGen/ARM64/sminv.ll
rename to llvm/test/CodeGen/AArch64/arm64-sminv.ll
index a246868..195c4e5 100644
--- a/llvm/test/CodeGen/ARM64/sminv.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-sminv.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s
 
 define signext i8 @test_vminv_s8(<8 x i8> %a1) {
 ; CHECK: test_vminv_s8
@@ -6,7 +6,7 @@
 ; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
 ; CHECK-NEXT: ret
 entry:
-  %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v8i8(<8 x i8> %a1)
+  %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8> %a1)
   %0 = trunc i32 %vminv.i to i8
   ret i8 %0
 }
@@ -17,7 +17,7 @@
 ; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
 ; CHECK-NEXT: ret
 entry:
-  %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v4i16(<4 x i16> %a1)
+  %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16> %a1)
   %0 = trunc i32 %vminv.i to i16
   ret i16 %0
 }
@@ -29,7 +29,7 @@
 ; CHECK-NEXT: fmov w0, s[[REGNUM]]
 ; CHECK-NEXT: ret
 entry:
-  %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v2i32(<2 x i32> %a1)
+  %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v2i32(<2 x i32> %a1)
   ret i32 %vminv.i
 }
 
@@ -39,7 +39,7 @@
 ; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
 ; CHECK-NEXT: ret
 entry:
-  %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v16i8(<16 x i8> %a1)
+  %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8> %a1)
   %0 = trunc i32 %vminv.i to i8
   ret i8 %0
 }
@@ -50,7 +50,7 @@
 ; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
 ; CHECK-NEXT: ret
 entry:
-  %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v8i16(<8 x i16> %a1)
+  %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> %a1)
   %0 = trunc i32 %vminv.i to i16
   ret i16 %0
 }
@@ -61,14 +61,14 @@
 ; CHECK-NEXT: fmov w0, [[REGNUM]]
 ; CHECK-NEXT: ret
 entry:
-  %vminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v4i32(<4 x i32> %a1)
+  %vminv.i = tail call i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32> %a1)
   ret i32 %vminv.i
 }
 
-declare i32 @llvm.arm64.neon.sminv.i32.v4i32(<4 x i32>)
-declare i32 @llvm.arm64.neon.sminv.i32.v8i16(<8 x i16>)
-declare i32 @llvm.arm64.neon.sminv.i32.v16i8(<16 x i8>)
-declare i32 @llvm.arm64.neon.sminv.i32.v2i32(<2 x i32>)
-declare i32 @llvm.arm64.neon.sminv.i32.v4i16(<4 x i16>)
-declare i32 @llvm.arm64.neon.sminv.i32.v8i8(<8 x i8>)
+declare i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32>)
+declare i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16>)
+declare i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8>)
+declare i32 @llvm.aarch64.neon.sminv.i32.v2i32(<2 x i32>)
+declare i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16>)
+declare i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8>)
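
As with smaxv above, the across-lanes sminv intrinsics always produce an i32,
so narrower element types are truncated back down in IR. A minimal sketch
under the new prefix (the function name is illustrative, not from this patch):

    declare i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8>)

    define i8 @sminv_example(<8 x i8> %v) {
      ; horizontal signed-min reduction; the byte result comes back in an i32
      %r = tail call i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8> %v)
      %t = trunc i32 %r to i8
      ret i8 %t
    }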
 
diff --git a/llvm/test/CodeGen/ARM64/spill-lr.ll b/llvm/test/CodeGen/AArch64/arm64-spill-lr.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/spill-lr.ll
rename to llvm/test/CodeGen/AArch64/arm64-spill-lr.ll
diff --git a/llvm/test/CodeGen/ARM64/spill.ll b/llvm/test/CodeGen/AArch64/arm64-spill.ll
similarity index 88%
rename from llvm/test/CodeGen/ARM64/spill.ll
rename to llvm/test/CodeGen/AArch64/arm64-spill.ll
index 5a5da97..47cdc2b 100644
--- a/llvm/test/CodeGen/ARM64/spill.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-spill.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -arm64-neon-syntax=apple -verify-machineinstrs
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
 
 ; CHECK: fpr128
 ; CHECK: ld1.2d
diff --git a/llvm/test/CodeGen/AArch64/arm64-st1.ll b/llvm/test/CodeGen/AArch64/arm64-st1.ll
new file mode 100644
index 0000000..4370484
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-st1.ll
@@ -0,0 +1,676 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
+
+define void @st1lane_16b(<16 x i8> %A, i8* %D) {
+; CHECK-LABEL: st1lane_16b
+; CHECK: st1.b
+  %tmp = extractelement <16 x i8> %A, i32 1
+  store i8 %tmp, i8* %D
+  ret void
+}
+
+define void @st1lane_8h(<8 x i16> %A, i16* %D) {
+; CHECK-LABEL: st1lane_8h
+; CHECK: st1.h
+  %tmp = extractelement <8 x i16> %A, i32 1
+  store i16 %tmp, i16* %D
+  ret void
+}
+
+define void @st1lane_4s(<4 x i32> %A, i32* %D) {
+; CHECK-LABEL: st1lane_4s
+; CHECK: st1.s
+  %tmp = extractelement <4 x i32> %A, i32 1
+  store i32 %tmp, i32* %D
+  ret void
+}
+
+define void @st1lane_4s_float(<4 x float> %A, float* %D) {
+; CHECK-LABEL: st1lane_4s_float
+; CHECK: st1.s
+  %tmp = extractelement <4 x float> %A, i32 1
+  store float %tmp, float* %D
+  ret void
+}
+
+define void @st1lane_2d(<2 x i64> %A, i64* %D) {
+; CHECK-LABEL: st1lane_2d
+; CHECK: st1.d
+  %tmp = extractelement <2 x i64> %A, i32 1
+  store i64 %tmp, i64* %D
+  ret void
+}
+
+define void @st1lane_2d_double(<2 x double> %A, double* %D) {
+; CHECK-LABEL: st1lane_2d_double
+; CHECK: st1.d
+  %tmp = extractelement <2 x double> %A, i32 1
+  store double %tmp, double* %D
+  ret void
+}
+
+define void @st1lane_8b(<8 x i8> %A, i8* %D) {
+; CHECK-LABEL: st1lane_8b
+; CHECK: st1.b
+  %tmp = extractelement <8 x i8> %A, i32 1
+  store i8 %tmp, i8* %D
+  ret void
+}
+
+define void @st1lane_4h(<4 x i16> %A, i16* %D) {
+; CHECK-LABEL: st1lane_4h
+; CHECK: st1.h
+  %tmp = extractelement <4 x i16> %A, i32 1
+  store i16 %tmp, i16* %D
+  ret void
+}
+
+define void @st1lane_2s(<2 x i32> %A, i32* %D) {
+; CHECK-LABEL: st1lane_2s
+; CHECK: st1.s
+  %tmp = extractelement <2 x i32> %A, i32 1
+  store i32 %tmp, i32* %D
+  ret void
+}
+
+define void @st1lane_2s_float(<2 x float> %A, float* %D) {
+; CHECK-LABEL: st1lane_2s_float
+; CHECK: st1.s
+  %tmp = extractelement <2 x float> %A, i32 1
+  store float %tmp, float* %D
+  ret void
+}
+
+define void @st2lane_16b(<16 x i8> %A, <16 x i8> %B, i8* %D) {
+; CHECK-LABEL: st2lane_16b
+; CHECK: st2.b
+  call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i64 1, i8* %D)
+  ret void
+}
+
+define void @st2lane_8h(<8 x i16> %A, <8 x i16> %B, i16* %D) {
+; CHECK-LABEL: st2lane_8h
+; CHECK: st2.h
+  call void @llvm.aarch64.neon.st2lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i64 1, i16* %D)
+  ret void
+}
+
+define void @st2lane_4s(<4 x i32> %A, <4 x i32> %B, i32* %D) {
+; CHECK-LABEL: st2lane_4s
+; CHECK: st2.s
+  call void @llvm.aarch64.neon.st2lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i64 1, i32* %D)
+  ret void
+}
+
+define void @st2lane_2d(<2 x i64> %A, <2 x i64> %B, i64* %D) {
+; CHECK-LABEL: st2lane_2d
+; CHECK: st2.d
+  call void @llvm.aarch64.neon.st2lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64 1, i64* %D)
+  ret void
+}
+
+declare void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) nounwind readnone
+declare void @llvm.aarch64.neon.st2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) nounwind readnone
+declare void @llvm.aarch64.neon.st2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) nounwind readnone
+declare void @llvm.aarch64.neon.st2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) nounwind readnone
+
+define void @st3lane_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %D) {
+; CHECK-LABEL: st3lane_16b
+; CHECK: st3.b
+  call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i64 1, i8* %D)
+  ret void
+}
+
+define void @st3lane_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %D) {
+; CHECK-LABEL: st3lane_8h
+; CHECK: st3.h
+  call void @llvm.aarch64.neon.st3lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i64 1, i16* %D)
+  ret void
+}
+
+define void @st3lane_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %D) {
+; CHECK-LABEL: st3lane_4s
+; CHECK: st3.s
+  call void @llvm.aarch64.neon.st3lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i64 1, i32* %D)
+  ret void
+}
+
+define void @st3lane_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %D) {
+; CHECK-LABEL: st3lane_2d
+; CHECK: st3.d
+  call void @llvm.aarch64.neon.st3lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64 1, i64* %D)
+  ret void
+}
+
+declare void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readnone
+declare void @llvm.aarch64.neon.st3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readnone
+declare void @llvm.aarch64.neon.st3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readnone
+declare void @llvm.aarch64.neon.st3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readnone
+
+define void @st4lane_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %E) {
+; CHECK-LABEL: st4lane_16b
+; CHECK: st4.b
+  call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 1, i8* %E)
+  ret void
+}
+
+define void @st4lane_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %E) {
+; CHECK-LABEL: st4lane_8h
+; CHECK: st4.h
+  call void @llvm.aarch64.neon.st4lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 1, i16* %E)
+  ret void
+}
+
+define void @st4lane_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %E) {
+; CHECK-LABEL: st4lane_4s
+; CHECK: st4.s
+  call void @llvm.aarch64.neon.st4lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 1, i32* %E)
+  ret void
+}
+
+define void @st4lane_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %E) {
+; CHECK-LABEL: st4lane_2d
+; CHECK: st4.d
+  call void @llvm.aarch64.neon.st4lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 1, i64* %E)
+  ret void
+}
+
+declare void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readnone
+declare void @llvm.aarch64.neon.st4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readnone
+declare void @llvm.aarch64.neon.st4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readnone
+declare void @llvm.aarch64.neon.st4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readnone
+
+
+define void @st2_8b(<8 x i8> %A, <8 x i8> %B, i8* %P) nounwind {
+; CHECK-LABEL: st2_8b
+; CHECK: st2.8b
+	call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, i8* %P)
+	ret void
+}
+
+define void @st3_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %P) nounwind {
+; CHECK-LABEL: st3_8b
+; CHECK: st3.8b
+	call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %P)
+	ret void
+}
+
+define void @st4_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %P) nounwind {
+; CHECK-LABEL: st4_8b
+; CHECK: st4.8b
+	call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %P)
+	ret void
+}
+
+declare void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly
+
+define void @st2_16b(<16 x i8> %A, <16 x i8> %B, i8* %P) nounwind {
+; CHECK-LABEL: st2_16b
+; CHECK: st2.16b
+	call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i8* %P)
+	ret void
+}
+
+define void @st3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %P) nounwind {
+; CHECK-LABEL: st3_16b
+; CHECK: st3.16b
+	call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %P)
+	ret void
+}
+
+define void @st4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %P) nounwind {
+; CHECK-LABEL: st4_16b
+; CHECK: st4.16b
+	call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %P)
+	ret void
+}
+
+declare void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly
+
+define void @st2_4h(<4 x i16> %A, <4 x i16> %B, i16* %P) nounwind {
+; CHECK-LABEL: st2_4h
+; CHECK: st2.4h
+	call void @llvm.aarch64.neon.st2.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, i16* %P)
+	ret void
+}
+
+define void @st3_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %P) nounwind {
+; CHECK-LABEL: st3_4h
+; CHECK: st3.4h
+	call void @llvm.aarch64.neon.st3.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %P)
+	ret void
+}
+
+define void @st4_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %P) nounwind {
+; CHECK-LABEL: st4_4h
+; CHECK: st4.4h
+	call void @llvm.aarch64.neon.st4.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %P)
+	ret void
+}
+
+declare void @llvm.aarch64.neon.st2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly
+
+define void @st2_8h(<8 x i16> %A, <8 x i16> %B, i16* %P) nounwind {
+; CHECK-LABEL: st2_8h
+; CHECK: st2.8h
+	call void @llvm.aarch64.neon.st2.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i16* %P)
+	ret void
+}
+
+define void @st3_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %P) nounwind {
+; CHECK-LABEL: st3_8h
+; CHECK: st3.8h
+	call void @llvm.aarch64.neon.st3.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %P)
+	ret void
+}
+
+define void @st4_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %P) nounwind {
+; CHECK-LABEL: st4_8h
+; CHECK: st4.8h
+	call void @llvm.aarch64.neon.st4.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %P)
+	ret void
+}
+
+declare void @llvm.aarch64.neon.st2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly
+
+define void @st2_2s(<2 x i32> %A, <2 x i32> %B, i32* %P) nounwind {
+; CHECK-LABEL: st2_2s
+; CHECK: st2.2s
+	call void @llvm.aarch64.neon.st2.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, i32* %P)
+	ret void
+}
+
+define void @st3_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %P) nounwind {
+; CHECK-LABEL: st3_2s
+; CHECK: st3.2s
+	call void @llvm.aarch64.neon.st3.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %P)
+	ret void
+}
+
+define void @st4_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %P) nounwind {
+; CHECK-LABEL: st4_2s
+; CHECK: st4.2s
+	call void @llvm.aarch64.neon.st4.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %P)
+	ret void
+}
+
+declare void @llvm.aarch64.neon.st2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly
+
+define void @st2_4s(<4 x i32> %A, <4 x i32> %B, i32* %P) nounwind {
+; CHECK-LABEL: st2_4s
+; CHECK: st2.4s
+	call void @llvm.aarch64.neon.st2.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i32* %P)
+	ret void
+}
+
+define void @st3_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %P) nounwind {
+; CHECK-LABEL: st3_4s
+; CHECK: st3.4s
+	call void @llvm.aarch64.neon.st3.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %P)
+	ret void
+}
+
+define void @st4_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %P) nounwind {
+; CHECK-LABEL: st4_4s
+; CHECK: st4.4s
+	call void @llvm.aarch64.neon.st4.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %P)
+	ret void
+}
+
+declare void @llvm.aarch64.neon.st2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly
+
+define void @st2_1d(<1 x i64> %A, <1 x i64> %B, i64* %P) nounwind {
+; CHECK-LABEL: st2_1d
+; CHECK: st1.1d
+	call void @llvm.aarch64.neon.st2.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, i64* %P)
+	ret void
+}
+
+define void @st3_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %P) nounwind {
+; CHECK-LABEL: st3_1d
+; CHECK: st1.1d
+	call void @llvm.aarch64.neon.st3.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %P)
+	ret void
+}
+
+define void @st4_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %P) nounwind {
+; CHECK-LABEL: st4_1d
+; CHECK: st1.1d
+	call void @llvm.aarch64.neon.st4.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %P)
+	ret void
+}
+
+declare void @llvm.aarch64.neon.st2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly
+
+define void @st2_2d(<2 x i64> %A, <2 x i64> %B, i64* %P) nounwind {
+; CHECK-LABEL: st2_2d
+; CHECK: st2.2d
+	call void @llvm.aarch64.neon.st2.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64* %P)
+	ret void
+}
+
+define void @st3_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %P) nounwind {
+; CHECK-LABEL: st3_2d
+; CHECK: st3.2d
+	call void @llvm.aarch64.neon.st3.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %P)
+	ret void
+}
+
+define void @st4_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %P) nounwind {
+; CHECK-LABEL: st4_2d
+; CHECK: st4.2d
+	call void @llvm.aarch64.neon.st4.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %P)
+	ret void
+}
+
+declare void @llvm.aarch64.neon.st2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly
+
+declare void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v2f32.p0f32(<2 x float>, <2 x float>, float*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v1f64.p0f64(<1 x double>, <1 x double>, double*) nounwind readonly
+
+define void @st1_x2_v8i8(<8 x i8> %A, <8 x i8> %B, i8* %addr) {
+; CHECK-LABEL: st1_x2_v8i8:
+; CHECK: st1.8b { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, i8* %addr)
+  ret void
+}
+
+define void @st1_x2_v4i16(<4 x i16> %A, <4 x i16> %B, i16* %addr) {
+; CHECK-LABEL: st1_x2_v4i16:
+; CHECK: st1.4h { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, i16* %addr)
+  ret void
+}
+
+define void @st1_x2_v2i32(<2 x i32> %A, <2 x i32> %B, i32* %addr) {
+; CHECK-LABEL: st1_x2_v2i32:
+; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, i32* %addr)
+  ret void
+}
+
+define void @st1_x2_v2f32(<2 x float> %A, <2 x float> %B, float* %addr) {
+; CHECK-LABEL: st1_x2_v2f32:
+; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x2.v2f32.p0f32(<2 x float> %A, <2 x float> %B, float* %addr)
+  ret void
+}
+
+define void @st1_x2_v1i64(<1 x i64> %A, <1 x i64> %B, i64* %addr) {
+; CHECK-LABEL: st1_x2_v1i64:
+; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, i64* %addr)
+  ret void
+}
+
+define void @st1_x2_v1f64(<1 x double> %A, <1 x double> %B, double* %addr) {
+; CHECK-LABEL: st1_x2_v1f64:
+; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x2.v1f64.p0f64(<1 x double> %A, <1 x double> %B, double* %addr)
+  ret void
+}
+
+declare void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float>, <4 x float>, float*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x2.v2f64.p0f64(<2 x double>, <2 x double>, double*) nounwind readonly
+
+define void @st1_x2_v16i8(<16 x i8> %A, <16 x i8> %B, i8* %addr) {
+; CHECK-LABEL: st1_x2_v16i8:
+; CHECK: st1.16b { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i8* %addr)
+  ret void
+}
+
+define void @st1_x2_v8i16(<8 x i16> %A, <8 x i16> %B, i16* %addr) {
+; CHECK-LABEL: st1_x2_v8i16:
+; CHECK: st1.8h { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i16* %addr)
+  ret void
+}
+
+define void @st1_x2_v4i32(<4 x i32> %A, <4 x i32> %B, i32* %addr) {
+; CHECK-LABEL: st1_x2_v4i32:
+; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i32* %addr)
+  ret void
+}
+
+define void @st1_x2_v4f32(<4 x float> %A, <4 x float> %B, float* %addr) {
+; CHECK-LABEL: st1_x2_v4f32:
+; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float> %A, <4 x float> %B, float* %addr)
+  ret void
+}
+
+define void @st1_x2_v2i64(<2 x i64> %A, <2 x i64> %B, i64* %addr) {
+; CHECK-LABEL: st1_x2_v2i64:
+; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64* %addr)
+  ret void
+}
+
+define void @st1_x2_v2f64(<2 x double> %A, <2 x double> %B, double* %addr) {
+; CHECK-LABEL: st1_x2_v2f64:
+; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x2.v2f64.p0f64(<2 x double> %A, <2 x double> %B, double* %addr)
+  ret void
+}
+
+declare void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, float*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, double*) nounwind readonly
+
+define void @st1_x3_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %addr) {
+; CHECK-LABEL: st1_x3_v8i8:
+; CHECK: st1.8b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %addr)
+  ret void
+}
+
+define void @st1_x3_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %addr) {
+; CHECK-LABEL: st1_x3_v4i16:
+; CHECK: st1.4h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %addr)
+  ret void
+}
+
+define void @st1_x3_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %addr) {
+; CHECK-LABEL: st1_x3_v2i32:
+; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %addr)
+  ret void
+}
+
+define void @st1_x3_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, float* %addr) {
+; CHECK-LABEL: st1_x3_v2f32:
+; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x3.v2f32.p0f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, float* %addr)
+  ret void
+}
+
+define void @st1_x3_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %addr) {
+; CHECK-LABEL: st1_x3_v1i64:
+; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %addr)
+  ret void
+}
+
+define void @st1_x3_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, double* %addr) {
+; CHECK-LABEL: st1_x3_v1f64:
+; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x3.v1f64.p0f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, double* %addr)
+  ret void
+}
+
+declare void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x3.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, double*) nounwind readonly
+
+define void @st1_x3_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %addr) {
+; CHECK-LABEL: st1_x3_v16i8:
+; CHECK: st1.16b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %addr)
+  ret void
+}
+
+define void @st1_x3_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %addr) {
+; CHECK-LABEL: st1_x3_v8i16:
+; CHECK: st1.8h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %addr)
+  ret void
+}
+
+define void @st1_x3_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %addr) {
+; CHECK-LABEL: st1_x3_v4i32:
+; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %addr)
+  ret void
+}
+
+define void @st1_x3_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, float* %addr) {
+; CHECK-LABEL: st1_x3_v4f32:
+; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, float* %addr)
+  ret void
+}
+
+define void @st1_x3_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %addr) {
+; CHECK-LABEL: st1_x3_v2i64:
+; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %addr)
+  ret void
+}
+
+define void @st1_x3_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, double* %addr) {
+; CHECK-LABEL: st1_x3_v2f64:
+; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x3.v2f64.p0f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, double* %addr)
+  ret void
+}
+
+
+declare void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, float*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, double*) nounwind readonly
+
+define void @st1_x4_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %addr) {
+; CHECK-LABEL: st1_x4_v8i8:
+; CHECK: st1.8b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %addr)
+  ret void
+}
+
+define void @st1_x4_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %addr) {
+; CHECK-LABEL: st1_x4_v4i16:
+; CHECK: st1.4h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %addr)
+  ret void
+}
+
+define void @st1_x4_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %addr) {
+; CHECK-LABEL: st1_x4_v2i32:
+; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %addr)
+  ret void
+}
+
+define void @st1_x4_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x float> %D, float* %addr) {
+; CHECK-LABEL: st1_x4_v2f32:
+; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x4.v2f32.p0f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x float> %D, float* %addr)
+  ret void
+}
+
+define void @st1_x4_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %addr) {
+; CHECK-LABEL: st1_x4_v1i64:
+; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %addr)
+  ret void
+}
+
+define void @st1_x4_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, double* %addr) {
+; CHECK-LABEL: st1_x4_v1f64:
+; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x4.v1f64.p0f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, double* %addr)
+  ret void
+}
+
+declare void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly
+declare void @llvm.aarch64.neon.st1x4.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, double*) nounwind readonly
+
+define void @st1_x4_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %addr) {
+; CHECK-LABEL: st1_x4_v16i8:
+; CHECK: st1.16b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %addr)
+  ret void
+}
+
+define void @st1_x4_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %addr) {
+; CHECK-LABEL: st1_x4_v8i16:
+; CHECK: st1.8h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %addr)
+  ret void
+}
+
+define void @st1_x4_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %addr) {
+; CHECK-LABEL: st1_x4_v4i32:
+; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %addr)
+  ret void
+}
+
+define void @st1_x4_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x float> %D, float* %addr) {
+; CHECK-LABEL: st1_x4_v4f32:
+; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x float> %D, float* %addr)
+  ret void
+}
+
+define void @st1_x4_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %addr) {
+; CHECK-LABEL: st1_x4_v2i64:
+; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %addr)
+  ret void
+}
+
+define void @st1_x4_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, double* %addr) {
+; CHECK-LABEL: st1_x4_v2f64:
+; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
+  call void @llvm.aarch64.neon.st1x4.v2f64.p0f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, double* %addr)
+  ret void
+}
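
The st1x2/st1x3/st1x4 intrinsics exercised above store their register operands
back-to-back, unlike st2/st3/st4, which interleave elements across registers.
A minimal sketch of the contrast, reusing two declarations from this file; the
function name and the memory-layout comment are illustrative, not part of the
original test:

define void @st2_vs_st1x2(<2 x i64> %A, <2 x i64> %B, i64* %addr) {
; st2.2d stores A[0],B[0],A[1],B[1]; st1.2d { v0, v1 } stores A[0],A[1],B[0],B[1]
  call void @llvm.aarch64.neon.st2.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64* %addr)
  call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64* %addr)
  ret void
}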
diff --git a/llvm/test/CodeGen/ARM64/stack-no-frame.ll b/llvm/test/CodeGen/AArch64/arm64-stack-no-frame.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/stack-no-frame.ll
rename to llvm/test/CodeGen/AArch64/arm64-stack-no-frame.ll
diff --git a/llvm/test/CodeGen/ARM64/stackmap.ll b/llvm/test/CodeGen/AArch64/arm64-stackmap.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/stackmap.ll
rename to llvm/test/CodeGen/AArch64/arm64-stackmap.ll
diff --git a/llvm/test/CodeGen/ARM64/stackpointer.ll b/llvm/test/CodeGen/AArch64/arm64-stackpointer.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/stackpointer.ll
rename to llvm/test/CodeGen/AArch64/arm64-stackpointer.ll
diff --git a/llvm/test/CodeGen/ARM64/stacksave.ll b/llvm/test/CodeGen/AArch64/arm64-stacksave.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/stacksave.ll
rename to llvm/test/CodeGen/AArch64/arm64-stacksave.ll
diff --git a/llvm/test/CodeGen/ARM64/stp.ll b/llvm/test/CodeGen/AArch64/arm64-stp.ll
similarity index 94%
rename from llvm/test/CodeGen/ARM64/stp.ll
rename to llvm/test/CodeGen/AArch64/arm64-stp.ll
index 3a58396..40bdf22 100644
--- a/llvm/test/CodeGen/ARM64/stp.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-stp.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=arm64 -arm64-stp-suppress=false -verify-machineinstrs -mcpu=cyclone | FileCheck %s
-; RUN: llc < %s -march=arm64 -arm64-unscaled-mem-op=true\
+; RUN: llc < %s -march=arm64 -aarch64-stp-suppress=false -verify-machineinstrs -mcpu=cyclone | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-unscaled-mem-op=true\
 ; RUN:   -verify-machineinstrs -mcpu=cyclone | FileCheck -check-prefix=STUR_CHK %s
 
 ; CHECK: stp_int
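
The stp tests above pin down store pairing; as a sketch, the kind of input the
pass is expected to merge looks like the following (the function name, register
numbers, and the expected-output comment are assumptions, not the file's
contents):

define void @stp_int_sketch(i32 %a, i32 %b, i32* %p) nounwind {
; expected to pair into: stp w0, w1, [x2]
  store i32 %a, i32* %p, align 4
  %q = getelementptr i32* %p, i64 1
  store i32 %b, i32* %q, align 4
  ret void
}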
diff --git a/llvm/test/CodeGen/ARM64/strict-align.ll b/llvm/test/CodeGen/AArch64/arm64-strict-align.ll
similarity index 75%
rename from llvm/test/CodeGen/ARM64/strict-align.ll
rename to llvm/test/CodeGen/AArch64/arm64-strict-align.ll
index 48a1528..5d13704 100644
--- a/llvm/test/CodeGen/ARM64/strict-align.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-strict-align.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s
-; RUN: llc < %s -mtriple=arm64-apple-darwin -arm64-no-strict-align | FileCheck %s
-; RUN: llc < %s -mtriple=arm64-apple-darwin -arm64-strict-align | FileCheck %s --check-prefix=CHECK-STRICT
+; RUN: llc < %s -mtriple=arm64-apple-darwin -aarch64-no-strict-align | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-apple-darwin -aarch64-strict-align | FileCheck %s --check-prefix=CHECK-STRICT
 
 define i32 @f0(i32* nocapture %p) nounwind {
 ; CHECK-STRICT: ldrh [[HIGH:w[0-9]+]], [x0, #2]
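
The CHECK-STRICT lines above depend on the load's alignment: under
-aarch64-strict-align an i32 load that is not provably 4-byte aligned is broken
into halfword pieces, while -aarch64-no-strict-align keeps a single ldr. A
sketch of an input that forces the split, assuming align 2 is what triggers it:

define i32 @underaligned_load_sketch(i32* %p) nounwind {
  %v = load i32* %p, align 2
  ret i32 %v
}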
diff --git a/llvm/test/CodeGen/ARM64/stur.ll b/llvm/test/CodeGen/AArch64/arm64-stur.ll
similarity index 96%
rename from llvm/test/CodeGen/ARM64/stur.ll
rename to llvm/test/CodeGen/AArch64/arm64-stur.ll
index dc67b60..a2e684d 100644
--- a/llvm/test/CodeGen/ARM64/stur.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-stur.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -mcpu=cyclone | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -mcpu=cyclone | FileCheck %s
 %struct.X = type <{ i32, i64, i64 }>
 
 define void @foo1(i32* %p, i64 %val) nounwind {
diff --git a/llvm/test/CodeGen/ARM64/subsections.ll b/llvm/test/CodeGen/AArch64/arm64-subsections.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/subsections.ll
rename to llvm/test/CodeGen/AArch64/arm64-subsections.ll
diff --git a/llvm/test/CodeGen/ARM64/subvector-extend.ll b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
similarity index 96%
rename from llvm/test/CodeGen/ARM64/subvector-extend.ll
rename to llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
index ad2f06c..d5a178a 100644
--- a/llvm/test/CodeGen/ARM64/subvector-extend.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-subvector-extend.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
 
 ; Test efficient codegen of vector extends up from legal type to 128 bit
 ; and 256 bit vector types.
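
One concrete instance of the extends the file covers, as a sketch (the function
and the expected mnemonic are assumptions, not taken from the test): widening
<8 x i8> to <8 x i16> should select a single shift-left-long, e.g. ushll.8h in
apple syntax.

define <8 x i16> @zext_v8i8_sketch(<8 x i8> %v) nounwind {
; expected: ushll.8h {{v[0-9]+}}, {{v[0-9]+}}, #0
  %r = zext <8 x i8> %v to <8 x i16>
  ret <8 x i16> %r
}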
diff --git a/llvm/test/CodeGen/ARM64/swizzle-tbl-i16-layout.ll b/llvm/test/CodeGen/AArch64/arm64-swizzle-tbl-i16-layout.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/swizzle-tbl-i16-layout.ll
rename to llvm/test/CodeGen/AArch64/arm64-swizzle-tbl-i16-layout.ll
diff --git a/llvm/test/CodeGen/AArch64/arm64-tbl.ll b/llvm/test/CodeGen/AArch64/arm64-tbl.ll
new file mode 100644
index 0000000..b1ce15a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-tbl.ll
@@ -0,0 +1,132 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @tbl1_8b(<16 x i8> %A, <8 x i8> %B) nounwind {
+; CHECK: tbl1_8b
+; CHECK: tbl.8b
+  %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %A, <8 x i8> %B)
+  ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbl1_16b(<16 x i8> %A, <16 x i8> %B) nounwind {
+; CHECK: tbl1_16b
+; CHECK: tbl.16b
+  %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %A, <16 x i8> %B)
+  ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbl2_8b(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) {
+; CHECK: tbl2_8b
+; CHECK: tbl.8b
+  %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C)
+  ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbl2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) {
+; CHECK: tbl2_16b
+; CHECK: tbl.16b
+  %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C)
+  ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbl3_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
+; CHECK: tbl3_8b
+; CHECK: tbl.8b
+  %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D)
+  ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbl3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) {
+; CHECK: tbl3_16b
+; CHECK: tbl.16b
+  %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D)
+  ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbl4_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) {
+; CHECK: tbl4_8b
+; CHECK: tbl.8b
+  %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E)
+  ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbl4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) {
+; CHECK: tbl4_16b
+; CHECK: tbl.16b
+  %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E)
+  ret <16 x i8> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+
+define <8 x i8> @tbx1_8b(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C) nounwind {
+; CHECK: tbx1_8b
+; CHECK: tbx.8b
+  %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C)
+  ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbx1_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) nounwind {
+; CHECK: tbx1_16b
+; CHECK: tbx.16b
+  %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C)
+  ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbx2_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
+; CHECK: tbx2_8b
+; CHECK: tbx.8b
+  %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D)
+  ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbx2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) {
+; CHECK: tbx2_16b
+; CHECK: tbx.16b
+  %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D)
+  ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbx3_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) {
+; CHECK: tbx3_8b
+; CHECK: tbx.8b
+  %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E)
+  ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbx3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) {
+; CHECK: tbx3_16b
+; CHECK: tbx.16b
+  %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E)
+  ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @tbx4_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F) {
+; CHECK: tbx4_8b
+; CHECK: tbx.8b
+  %tmp3 = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F)
+  ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @tbx4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F) {
+; CHECK: tbx4_16b
+; CHECK: tbx.16b
+  %tmp3 = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F)
+  ret <16 x i8> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+
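
The tbl/tbx pairs above differ in how out-of-range index bytes are handled: tbl
writes zero to the result lane, while tbx leaves the corresponding lane of its
first operand untouched. A sketch using two declarations from this file (the
function itself is illustrative, not one of the tests):

define <8 x i8> @tbl_vs_tbx_sketch(<8 x i8> %fallback, <16 x i8> %table, <8 x i8> %idx) {
; for lanes where %idx >= 16: %t gets 0, %x keeps the %fallback byte
  %t = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %table, <8 x i8> %idx)
  %x = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %fallback, <16 x i8> %table, <8 x i8> %idx)
  %r = xor <8 x i8> %t, %x
  ret <8 x i8> %r
}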
diff --git a/llvm/test/CodeGen/ARM64/this-return.ll b/llvm/test/CodeGen/AArch64/arm64-this-return.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/this-return.ll
rename to llvm/test/CodeGen/AArch64/arm64-this-return.ll
diff --git a/llvm/test/CodeGen/ARM64/tls-darwin.ll b/llvm/test/CodeGen/AArch64/arm64-tls-darwin.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/tls-darwin.ll
rename to llvm/test/CodeGen/AArch64/arm64-tls-darwin.ll
diff --git a/llvm/test/CodeGen/ARM64/tls-dynamic-together.ll b/llvm/test/CodeGen/AArch64/arm64-tls-dynamic-together.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/tls-dynamic-together.ll
rename to llvm/test/CodeGen/AArch64/arm64-tls-dynamic-together.ll
diff --git a/llvm/test/CodeGen/ARM64/tls-dynamics.ll b/llvm/test/CodeGen/AArch64/arm64-tls-dynamics.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/tls-dynamics.ll
rename to llvm/test/CodeGen/AArch64/arm64-tls-dynamics.ll
diff --git a/llvm/test/CodeGen/ARM64/tls-execs.ll b/llvm/test/CodeGen/AArch64/arm64-tls-execs.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/tls-execs.ll
rename to llvm/test/CodeGen/AArch64/arm64-tls-execs.ll
diff --git a/llvm/test/CodeGen/ARM64/trap.ll b/llvm/test/CodeGen/AArch64/arm64-trap.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/trap.ll
rename to llvm/test/CodeGen/AArch64/arm64-trap.ll
diff --git a/llvm/test/CodeGen/ARM64/trn.ll b/llvm/test/CodeGen/AArch64/arm64-trn.ll
similarity index 98%
rename from llvm/test/CodeGen/ARM64/trn.ll
rename to llvm/test/CodeGen/AArch64/arm64-trn.ll
index f467984..2db7a14 100644
--- a/llvm/test/CodeGen/ARM64/trn.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-trn.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 define <8 x i8> @vtrni8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ;CHECK-LABEL: vtrni8:
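
The trn tests select on shufflevector masks; per the ARMv8 TRN1 definition, the
even-numbered lanes of both sources are interleaved (TRN2 takes the odd ones).
A sketch of the trn1 pattern, with an assumed function name:

define <8 x i8> @vtrn1_sketch(<8 x i8> %a, <8 x i8> %b) nounwind {
; expected: trn1.8b
  %r = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
  ret <8 x i8> %r
}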
diff --git a/llvm/test/CodeGen/ARM64/trunc-store.ll b/llvm/test/CodeGen/AArch64/arm64-trunc-store.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/trunc-store.ll
rename to llvm/test/CodeGen/AArch64/arm64-trunc-store.ll
diff --git a/llvm/test/CodeGen/ARM64/umaxv.ll b/llvm/test/CodeGen/AArch64/arm64-umaxv.ll
similarity index 75%
rename from llvm/test/CodeGen/ARM64/umaxv.ll
rename to llvm/test/CodeGen/AArch64/arm64-umaxv.ll
index 15277d3..d523f31 100644
--- a/llvm/test/CodeGen/ARM64/umaxv.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-umaxv.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 define i32 @vmax_u8x8(<8 x i8> %a) nounwind ssp {
 ; CHECK-LABEL: vmax_u8x8:
@@ -7,7 +7,7 @@
 ; CHECK-NOT: and
 ; CHECK: cbz     [[REG2]],
 entry:
-  %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v8i8(<8 x i8> %a) nounwind
+  %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> %a) nounwind
   %tmp = trunc i32 %vmaxv.i to i8
   %tobool = icmp eq i8 %tmp, 0
   br i1 %tobool, label %return, label %if.then
@@ -30,7 +30,7 @@
 ; CHECK-NOT: and
 ; CHECK: cbz     [[REG2]],
 entry:
-  %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v4i16(<4 x i16> %a) nounwind
+  %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16> %a) nounwind
   %tmp = trunc i32 %vmaxv.i to i16
   %tobool = icmp eq i16 %tmp, 0
   br i1 %tobool, label %return, label %if.then
@@ -51,7 +51,7 @@
 ; CHECK-NOT: and
 ; CHECK: cbz     [[REG2]],
 entry:
-  %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v8i16(<8 x i16> %a) nounwind
+  %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16> %a) nounwind
   %tmp = trunc i32 %vmaxv.i to i16
   %tobool = icmp eq i16 %tmp, 0
   br i1 %tobool, label %return, label %if.then
@@ -72,7 +72,7 @@
 ; CHECK-NOT: and
 ; CHECK: cbz     [[REG2]],
 entry:
-  %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v16i8(<16 x i8> %a) nounwind
+  %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> %a) nounwind
   %tmp = trunc i32 %vmaxv.i to i8
   %tobool = icmp eq i8 %tmp, 0
   br i1 %tobool, label %return, label %if.then
@@ -86,7 +86,7 @@
   ret i32 %retval.0
 }
 
-declare i32 @llvm.arm64.neon.umaxv.i32.v16i8(<16 x i8>) nounwind readnone
-declare i32 @llvm.arm64.neon.umaxv.i32.v8i16(<8 x i16>) nounwind readnone
-declare i32 @llvm.arm64.neon.umaxv.i32.v4i16(<4 x i16>) nounwind readnone
-declare i32 @llvm.arm64.neon.umaxv.i32.v8i8(<8 x i8>) nounwind readnone
+declare i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8>) nounwind readnone
+declare i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16>) nounwind readnone
+declare i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16>) nounwind readnone
+declare i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8>) nounwind readnone
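
The CHECK-NOT: and lines above rely on umaxv returning the unsigned lane
maximum already zero-extended in its i32 result, so the trunc-and-compare
sequence needs no masking. A reduced sketch of that pattern (function name
assumed):

define i1 @umaxv_nonzero_sketch(<8 x i8> %a) nounwind {
  %vmax = call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> %a)
  %t = trunc i32 %vmax to i8
  %nz = icmp ne i8 %t, 0
  ret i1 %nz
}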
diff --git a/llvm/test/CodeGen/ARM64/uminv.ll b/llvm/test/CodeGen/AArch64/arm64-uminv.ll
similarity index 75%
rename from llvm/test/CodeGen/ARM64/uminv.ll
rename to llvm/test/CodeGen/AArch64/arm64-uminv.ll
index 440522f..3bade4b 100644
--- a/llvm/test/CodeGen/ARM64/uminv.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-uminv.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 define i32 @vmin_u8x8(<8 x i8> %a) nounwind ssp {
 ; CHECK-LABEL: vmin_u8x8:
@@ -7,7 +7,7 @@
 ; CHECK-NOT: and
 ; CHECK: cbz     [[REG2]],
 entry:
-  %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v8i8(<8 x i8> %a) nounwind
+  %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %a) nounwind
   %tmp = trunc i32 %vminv.i to i8
   %tobool = icmp eq i8 %tmp, 0
   br i1 %tobool, label %return, label %if.then
@@ -30,7 +30,7 @@
 ; CHECK-NOT: and
 ; CHECK: cbz     [[REG2]],
 entry:
-  %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v4i16(<4 x i16> %a) nounwind
+  %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> %a) nounwind
   %tmp = trunc i32 %vminv.i to i16
   %tobool = icmp eq i16 %tmp, 0
   br i1 %tobool, label %return, label %if.then
@@ -51,7 +51,7 @@
 ; CHECK-NOT: and
 ; CHECK: cbz     [[REG2]],
 entry:
-  %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v8i16(<8 x i16> %a) nounwind
+  %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> %a) nounwind
   %tmp = trunc i32 %vminv.i to i16
   %tobool = icmp eq i16 %tmp, 0
   br i1 %tobool, label %return, label %if.then
@@ -72,7 +72,7 @@
 ; CHECK-NOT: and
 ; CHECK: cbz     [[REG2]],
 entry:
-  %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v16i8(<16 x i8> %a) nounwind
+  %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %a) nounwind
   %tmp = trunc i32 %vminv.i to i8
   %tobool = icmp eq i8 %tmp, 0
   br i1 %tobool, label %return, label %if.then
@@ -86,7 +86,7 @@
   ret i32 %retval.0
 }
 
-declare i32 @llvm.arm64.neon.uminv.i32.v16i8(<16 x i8>) nounwind readnone
-declare i32 @llvm.arm64.neon.uminv.i32.v8i16(<8 x i16>) nounwind readnone
-declare i32 @llvm.arm64.neon.uminv.i32.v4i16(<4 x i16>) nounwind readnone
-declare i32 @llvm.arm64.neon.uminv.i32.v8i8(<8 x i8>) nounwind readnone
+declare i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8>) nounwind readnone
+declare i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16>) nounwind readnone
+declare i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16>) nounwind readnone
+declare i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8>) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM64/umov.ll b/llvm/test/CodeGen/AArch64/arm64-umov.ll
similarity index 89%
rename from llvm/test/CodeGen/ARM64/umov.ll
rename to llvm/test/CodeGen/AArch64/arm64-umov.ll
index 19fd91b..a1ef990 100644
--- a/llvm/test/CodeGen/ARM64/umov.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-umov.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 define zeroext i8 @f1(<16 x i8> %a) {
 ; CHECK-LABEL: f1:
diff --git a/llvm/test/CodeGen/ARM64/unaligned_ldst.ll b/llvm/test/CodeGen/AArch64/arm64-unaligned_ldst.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/unaligned_ldst.ll
rename to llvm/test/CodeGen/AArch64/arm64-unaligned_ldst.ll
diff --git a/llvm/test/CodeGen/ARM64/uzp.ll b/llvm/test/CodeGen/AArch64/arm64-uzp.ll
similarity index 97%
rename from llvm/test/CodeGen/ARM64/uzp.ll
rename to llvm/test/CodeGen/AArch64/arm64-uzp.ll
index 60e16d0..cdd8d31 100644
--- a/llvm/test/CodeGen/ARM64/uzp.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-uzp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ;CHECK-LABEL: vuzpi8:
diff --git a/llvm/test/CodeGen/ARM64/vaargs.ll b/llvm/test/CodeGen/AArch64/arm64-vaargs.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/vaargs.ll
rename to llvm/test/CodeGen/AArch64/arm64-vaargs.ll
diff --git a/llvm/test/CodeGen/ARM64/vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
similarity index 68%
rename from llvm/test/CodeGen/ARM64/vabs.ll
rename to llvm/test/CodeGen/AArch64/arm64-vabs.ll
index 0d8aa24..5afc8d9 100644
--- a/llvm/test/CodeGen/ARM64/vabs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 
 define <8 x i16> @sabdl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
@@ -6,7 +6,7 @@
 ;CHECK: sabdl.8h
         %tmp1 = load <8 x i8>* %A
         %tmp2 = load <8 x i8>* %B
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
         %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
         ret <8 x i16> %tmp4
 }
@@ -16,7 +16,7 @@
 ;CHECK: sabdl.4s
         %tmp1 = load <4 x i16>* %A
         %tmp2 = load <4 x i16>* %B
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
         %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
         ret <4 x i32> %tmp4
 }
@@ -26,7 +26,7 @@
 ;CHECK: sabdl.2d
         %tmp1 = load <2 x i32>* %A
         %tmp2 = load <2 x i32>* %B
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
         %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
         ret <2 x i64> %tmp4
 }
@@ -38,7 +38,7 @@
         %load2 = load <16 x i8>* %B
         %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
         %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
         %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
         ret <8 x i16> %tmp4
 }
@@ -50,7 +50,7 @@
         %load2 = load <8 x i16>* %B
         %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
         %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
         %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
         ret <4 x i32> %tmp4
 }
@@ -62,7 +62,7 @@
         %load2 = load <4 x i32>* %B
         %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
         %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
         %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
         ret <2 x i64> %tmp4
 }
@@ -72,7 +72,7 @@
 ;CHECK: uabdl.8h
   %tmp1 = load <8 x i8>* %A
   %tmp2 = load <8 x i8>* %B
-  %tmp3 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
   %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
   ret <8 x i16> %tmp4
 }
@@ -82,7 +82,7 @@
 ;CHECK: uabdl.4s
   %tmp1 = load <4 x i16>* %A
   %tmp2 = load <4 x i16>* %B
-  %tmp3 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
   %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
   ret <4 x i32> %tmp4
 }
@@ -92,7 +92,7 @@
 ;CHECK: uabdl.2d
   %tmp1 = load <2 x i32>* %A
   %tmp2 = load <2 x i32>* %B
-  %tmp3 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
   %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
   ret <2 x i64> %tmp4
 }
@@ -105,7 +105,7 @@
   %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 
-  %tmp3 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+  %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
   %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
   ret <8 x i16> %tmp4
 }
@@ -117,7 +117,7 @@
   %load2 = load <8 x i16>* %B
   %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %tmp3 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
   %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
   ret <4 x i32> %tmp4
 }
@@ -129,7 +129,7 @@
   %load2 = load <4 x i32>* %B
   %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
-  %tmp3 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
   %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
   ret <2 x i64> %tmp4
 }
@@ -139,7 +139,7 @@
 ;CHECK: fabd.2s
         %tmp1 = load <2 x float>* %A
         %tmp2 = load <2 x float>* %B
-        %tmp3 = call <2 x float> @llvm.arm64.neon.fabd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+        %tmp3 = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
         ret <2 x float> %tmp3
 }
 
@@ -148,7 +148,7 @@
 ;CHECK: fabd.4s
         %tmp1 = load <4 x float>* %A
         %tmp2 = load <4 x float>* %B
-        %tmp3 = call <4 x float> @llvm.arm64.neon.fabd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+        %tmp3 = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
         ret <4 x float> %tmp3
 }
 
@@ -157,20 +157,20 @@
 ;CHECK: fabd.2d
         %tmp1 = load <2 x double>* %A
         %tmp2 = load <2 x double>* %B
-        %tmp3 = call <2 x double> @llvm.arm64.neon.fabd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+        %tmp3 = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
         ret <2 x double> %tmp3
 }
 
-declare <2 x float> @llvm.arm64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double>, <2 x double>) nounwind readnone
 
 define <8 x i8> @sabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ;CHECK-LABEL: sabd_8b:
 ;CHECK: sabd.8b
         %tmp1 = load <8 x i8>* %A
         %tmp2 = load <8 x i8>* %B
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
         ret <8 x i8> %tmp3
 }
 
@@ -179,7 +179,7 @@
 ;CHECK: sabd.16b
         %tmp1 = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
-        %tmp3 = call <16 x i8> @llvm.arm64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+        %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
         ret <16 x i8> %tmp3
 }
 
@@ -188,7 +188,7 @@
 ;CHECK: sabd.4h
         %tmp1 = load <4 x i16>* %A
         %tmp2 = load <4 x i16>* %B
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
         ret <4 x i16> %tmp3
 }
 
@@ -197,7 +197,7 @@
 ;CHECK: sabd.8h
         %tmp1 = load <8 x i16>* %A
         %tmp2 = load <8 x i16>* %B
-        %tmp3 = call <8 x i16> @llvm.arm64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+        %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
         ret <8 x i16> %tmp3
 }
 
@@ -206,7 +206,7 @@
 ;CHECK: sabd.2s
         %tmp1 = load <2 x i32>* %A
         %tmp2 = load <2 x i32>* %B
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
         ret <2 x i32> %tmp3
 }
 
@@ -215,23 +215,23 @@
 ;CHECK: sabd.4s
         %tmp1 = load <4 x i32>* %A
         %tmp2 = load <4 x i32>* %B
-        %tmp3 = call <4 x i32> @llvm.arm64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+        %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
         ret <4 x i32> %tmp3
 }
 
-declare <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <8 x i8> @uabd_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ;CHECK-LABEL: uabd_8b:
 ;CHECK: uabd.8b
         %tmp1 = load <8 x i8>* %A
         %tmp2 = load <8 x i8>* %B
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
         ret <8 x i8> %tmp3
 }
 
@@ -240,7 +240,7 @@
 ;CHECK: uabd.16b
         %tmp1 = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
-        %tmp3 = call <16 x i8> @llvm.arm64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+        %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
         ret <16 x i8> %tmp3
 }
 
@@ -249,7 +249,7 @@
 ;CHECK: uabd.4h
         %tmp1 = load <4 x i16>* %A
         %tmp2 = load <4 x i16>* %B
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
         ret <4 x i16> %tmp3
 }
 
@@ -258,7 +258,7 @@
 ;CHECK: uabd.8h
         %tmp1 = load <8 x i16>* %A
         %tmp2 = load <8 x i16>* %B
-        %tmp3 = call <8 x i16> @llvm.arm64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+        %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
         ret <8 x i16> %tmp3
 }
 
@@ -267,7 +267,7 @@
 ;CHECK: uabd.2s
         %tmp1 = load <2 x i32>* %A
         %tmp2 = load <2 x i32>* %B
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
         ret <2 x i32> %tmp3
 }
 
@@ -276,22 +276,22 @@
 ;CHECK: uabd.4s
         %tmp1 = load <4 x i32>* %A
         %tmp2 = load <4 x i32>* %B
-        %tmp3 = call <4 x i32> @llvm.arm64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+        %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
         ret <4 x i32> %tmp3
 }
 
-declare <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
 
 define <8 x i8> @sqabs_8b(<8 x i8>* %A) nounwind {
 ;CHECK-LABEL: sqabs_8b:
 ;CHECK: sqabs.8b
         %tmp1 = load <8 x i8>* %A
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.sqabs.v8i8(<8 x i8> %tmp1)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %tmp1)
         ret <8 x i8> %tmp3
 }
 
@@ -299,7 +299,7 @@
 ;CHECK-LABEL: sqabs_16b:
 ;CHECK: sqabs.16b
         %tmp1 = load <16 x i8>* %A
-        %tmp3 = call <16 x i8> @llvm.arm64.neon.sqabs.v16i8(<16 x i8> %tmp1)
+        %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %tmp1)
         ret <16 x i8> %tmp3
 }
 
@@ -307,7 +307,7 @@
 ;CHECK-LABEL: sqabs_4h:
 ;CHECK: sqabs.4h
         %tmp1 = load <4 x i16>* %A
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.sqabs.v4i16(<4 x i16> %tmp1)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> %tmp1)
         ret <4 x i16> %tmp3
 }
 
@@ -315,7 +315,7 @@
 ;CHECK-LABEL: sqabs_8h:
 ;CHECK: sqabs.8h
         %tmp1 = load <8 x i16>* %A
-        %tmp3 = call <8 x i16> @llvm.arm64.neon.sqabs.v8i16(<8 x i16> %tmp1)
+        %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> %tmp1)
         ret <8 x i16> %tmp3
 }
 
@@ -323,7 +323,7 @@
 ;CHECK-LABEL: sqabs_2s:
 ;CHECK: sqabs.2s
         %tmp1 = load <2 x i32>* %A
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.sqabs.v2i32(<2 x i32> %tmp1)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> %tmp1)
         ret <2 x i32> %tmp3
 }
 
@@ -331,22 +331,22 @@
 ;CHECK-LABEL: sqabs_4s:
 ;CHECK: sqabs.4s
         %tmp1 = load <4 x i32>* %A
-        %tmp3 = call <4 x i32> @llvm.arm64.neon.sqabs.v4i32(<4 x i32> %tmp1)
+        %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> %tmp1)
         ret <4 x i32> %tmp3
 }
 
-declare <8 x i8> @llvm.arm64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32>) nounwind readnone
 
 define <8 x i8> @sqneg_8b(<8 x i8>* %A) nounwind {
 ;CHECK-LABEL: sqneg_8b:
 ;CHECK: sqneg.8b
         %tmp1 = load <8 x i8>* %A
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.sqneg.v8i8(<8 x i8> %tmp1)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %tmp1)
         ret <8 x i8> %tmp3
 }
 
@@ -354,7 +354,7 @@
 ;CHECK-LABEL: sqneg_16b:
 ;CHECK: sqneg.16b
         %tmp1 = load <16 x i8>* %A
-        %tmp3 = call <16 x i8> @llvm.arm64.neon.sqneg.v16i8(<16 x i8> %tmp1)
+        %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %tmp1)
         ret <16 x i8> %tmp3
 }
 
@@ -362,7 +362,7 @@
 ;CHECK-LABEL: sqneg_4h:
 ;CHECK: sqneg.4h
         %tmp1 = load <4 x i16>* %A
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.sqneg.v4i16(<4 x i16> %tmp1)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> %tmp1)
         ret <4 x i16> %tmp3
 }
 
@@ -370,7 +370,7 @@
 ;CHECK-LABEL: sqneg_8h:
 ;CHECK: sqneg.8h
         %tmp1 = load <8 x i16>* %A
-        %tmp3 = call <8 x i16> @llvm.arm64.neon.sqneg.v8i16(<8 x i16> %tmp1)
+        %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> %tmp1)
         ret <8 x i16> %tmp3
 }
 
@@ -378,7 +378,7 @@
 ;CHECK-LABEL: sqneg_2s:
 ;CHECK: sqneg.2s
         %tmp1 = load <2 x i32>* %A
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.sqneg.v2i32(<2 x i32> %tmp1)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> %tmp1)
         ret <2 x i32> %tmp3
 }
 
@@ -386,22 +386,22 @@
 ;CHECK-LABEL: sqneg_4s:
 ;CHECK: sqneg.4s
         %tmp1 = load <4 x i32>* %A
-        %tmp3 = call <4 x i32> @llvm.arm64.neon.sqneg.v4i32(<4 x i32> %tmp1)
+        %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> %tmp1)
         ret <4 x i32> %tmp3
 }
 
-declare <8 x i8> @llvm.arm64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32>) nounwind readnone
 
 define <8 x i8> @abs_8b(<8 x i8>* %A) nounwind {
 ;CHECK-LABEL: abs_8b:
 ;CHECK: abs.8b
         %tmp1 = load <8 x i8>* %A
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.abs.v8i8(<8 x i8> %tmp1)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %tmp1)
         ret <8 x i8> %tmp3
 }
 
@@ -409,7 +409,7 @@
 ;CHECK-LABEL: abs_16b:
 ;CHECK: abs.16b
         %tmp1 = load <16 x i8>* %A
-        %tmp3 = call <16 x i8> @llvm.arm64.neon.abs.v16i8(<16 x i8> %tmp1)
+        %tmp3 = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %tmp1)
         ret <16 x i8> %tmp3
 }
 
@@ -417,7 +417,7 @@
 ;CHECK-LABEL: abs_4h:
 ;CHECK: abs.4h
         %tmp1 = load <4 x i16>* %A
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.abs.v4i16(<4 x i16> %tmp1)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> %tmp1)
         ret <4 x i16> %tmp3
 }
 
@@ -425,7 +425,7 @@
 ;CHECK-LABEL: abs_8h:
 ;CHECK: abs.8h
         %tmp1 = load <8 x i16>* %A
-        %tmp3 = call <8 x i16> @llvm.arm64.neon.abs.v8i16(<8 x i16> %tmp1)
+        %tmp3 = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> %tmp1)
         ret <8 x i16> %tmp3
 }
 
@@ -433,7 +433,7 @@
 ;CHECK-LABEL: abs_2s:
 ;CHECK: abs.2s
         %tmp1 = load <2 x i32>* %A
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.abs.v2i32(<2 x i32> %tmp1)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> %tmp1)
         ret <2 x i32> %tmp3
 }
 
@@ -441,32 +441,32 @@
 ;CHECK-LABEL: abs_4s:
 ;CHECK: abs.4s
         %tmp1 = load <4 x i32>* %A
-        %tmp3 = call <4 x i32> @llvm.arm64.neon.abs.v4i32(<4 x i32> %tmp1)
+        %tmp3 = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> %tmp1)
         ret <4 x i32> %tmp3
 }
 
 define <1 x i64> @abs_1d(<1 x i64> %A) nounwind {
 ; CHECK-LABEL: abs_1d:
 ; CHECK: abs d0, d0
-  %abs = call <1 x i64> @llvm.arm64.neon.abs.v1i64(<1 x i64> %A)
+  %abs = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> %A)
   ret <1 x i64> %abs
 }
 
 define i64 @abs_1d_honestly(i64 %A) nounwind {
 ; CHECK-LABEL: abs_1d_honestly:
 ; CHECK: abs d0, d0
-  %abs = call i64 @llvm.arm64.neon.abs.i64(i64 %A)
+  %abs = call i64 @llvm.aarch64.neon.abs.i64(i64 %A)
   ret i64 %abs
 }
 
-declare <8 x i8> @llvm.arm64.neon.abs.v8i8(<8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.abs.v16i8(<16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.abs.v4i16(<4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.abs.v8i16(<8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.abs.v2i32(<2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.abs.v4i32(<4 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.abs.v1i64(<1 x i64>) nounwind readnone
-declare i64 @llvm.arm64.neon.abs.i64(i64) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64>) nounwind readnone
+declare i64 @llvm.aarch64.neon.abs.i64(i64) nounwind readnone
 
 define <8 x i16> @sabal8h(<8 x i8>* %A, <8 x i8>* %B,  <8 x i16>* %C) nounwind {
 ;CHECK-LABEL: sabal8h:
@@ -474,7 +474,7 @@
         %tmp1 = load <8 x i8>* %A
         %tmp2 = load <8 x i8>* %B
         %tmp3 = load <8 x i16>* %C
-        %tmp4 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+        %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
         %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
         %tmp5 = add <8 x i16> %tmp3, %tmp4.1
         ret <8 x i16> %tmp5
@@ -486,7 +486,7 @@
         %tmp1 = load <4 x i16>* %A
         %tmp2 = load <4 x i16>* %B
         %tmp3 = load <4 x i32>* %C
-        %tmp4 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+        %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
         %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
         %tmp5 = add <4 x i32> %tmp3, %tmp4.1
         ret <4 x i32> %tmp5
@@ -498,7 +498,7 @@
         %tmp1 = load <2 x i32>* %A
         %tmp2 = load <2 x i32>* %B
         %tmp3 = load <2 x i64>* %C
-        %tmp4 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+        %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
         %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
         %tmp4.1.1 = zext <2 x i32> %tmp4 to <2 x i64>
         %tmp5 = add <2 x i64> %tmp3, %tmp4.1
@@ -513,7 +513,7 @@
         %tmp3 = load <8 x i16>* %C
         %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
         %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-        %tmp4 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+        %tmp4 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
         %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
         %tmp5 = add <8 x i16> %tmp3, %tmp4.1
         ret <8 x i16> %tmp5
@@ -527,7 +527,7 @@
         %tmp3 = load <4 x i32>* %C
         %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
         %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-        %tmp4 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+        %tmp4 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
         %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
         %tmp5 = add <4 x i32> %tmp3, %tmp4.1
         ret <4 x i32> %tmp5
@@ -541,7 +541,7 @@
         %tmp3 = load <2 x i64>* %C
         %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
         %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
-        %tmp4 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+        %tmp4 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
         %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
         %tmp5 = add <2 x i64> %tmp3, %tmp4.1
         ret <2 x i64> %tmp5
@@ -553,7 +553,7 @@
         %tmp1 = load <8 x i8>* %A
         %tmp2 = load <8 x i8>* %B
         %tmp3 = load <8 x i16>* %C
-        %tmp4 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+        %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
         %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
         %tmp5 = add <8 x i16> %tmp3, %tmp4.1
         ret <8 x i16> %tmp5
@@ -565,7 +565,7 @@
         %tmp1 = load <4 x i16>* %A
         %tmp2 = load <4 x i16>* %B
         %tmp3 = load <4 x i32>* %C
-        %tmp4 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+        %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
         %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
         %tmp5 = add <4 x i32> %tmp3, %tmp4.1
         ret <4 x i32> %tmp5
@@ -577,7 +577,7 @@
         %tmp1 = load <2 x i32>* %A
         %tmp2 = load <2 x i32>* %B
         %tmp3 = load <2 x i64>* %C
-        %tmp4 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+        %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
         %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
         %tmp5 = add <2 x i64> %tmp3, %tmp4.1
         ret <2 x i64> %tmp5
@@ -591,7 +591,7 @@
         %tmp3 = load <8 x i16>* %C
         %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
         %tmp2 = shufflevector <16 x i8> %load2, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-        %tmp4 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+        %tmp4 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
         %tmp4.1 = zext <8 x i8> %tmp4 to <8 x i16>
         %tmp5 = add <8 x i16> %tmp3, %tmp4.1
         ret <8 x i16> %tmp5
@@ -605,7 +605,7 @@
         %tmp3 = load <4 x i32>* %C
         %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
         %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-        %tmp4 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+        %tmp4 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
         %tmp4.1 = zext <4 x i16> %tmp4 to <4 x i32>
         %tmp5 = add <4 x i32> %tmp3, %tmp4.1
         ret <4 x i32> %tmp5
@@ -619,7 +619,7 @@
         %tmp3 = load <2 x i64>* %C
         %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
         %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
-        %tmp4 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+        %tmp4 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
         %tmp4.1 = zext <2 x i32> %tmp4 to <2 x i64>
         %tmp5 = add <2 x i64> %tmp3, %tmp4.1
         ret <2 x i64> %tmp5
@@ -630,7 +630,7 @@
 ;CHECK: saba.8b
         %tmp1 = load <8 x i8>* %A
         %tmp2 = load <8 x i8>* %B
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
         %tmp4 = load <8 x i8>* %C
         %tmp5 = add <8 x i8> %tmp3, %tmp4
         ret <8 x i8> %tmp5
@@ -641,7 +641,7 @@
 ;CHECK: saba.16b
         %tmp1 = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
-        %tmp3 = call <16 x i8> @llvm.arm64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+        %tmp3 = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
         %tmp4 = load <16 x i8>* %C
         %tmp5 = add <16 x i8> %tmp3, %tmp4
         ret <16 x i8> %tmp5
@@ -652,7 +652,7 @@
 ;CHECK: saba.4h
         %tmp1 = load <4 x i16>* %A
         %tmp2 = load <4 x i16>* %B
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
         %tmp4 = load <4 x i16>* %C
         %tmp5 = add <4 x i16> %tmp3, %tmp4
         ret <4 x i16> %tmp5
@@ -663,7 +663,7 @@
 ;CHECK: saba.8h
         %tmp1 = load <8 x i16>* %A
         %tmp2 = load <8 x i16>* %B
-        %tmp3 = call <8 x i16> @llvm.arm64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+        %tmp3 = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
         %tmp4 = load <8 x i16>* %C
         %tmp5 = add <8 x i16> %tmp3, %tmp4
         ret <8 x i16> %tmp5
@@ -674,7 +674,7 @@
 ;CHECK: saba.2s
         %tmp1 = load <2 x i32>* %A
         %tmp2 = load <2 x i32>* %B
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
         %tmp4 = load <2 x i32>* %C
         %tmp5 = add <2 x i32> %tmp3, %tmp4
         ret <2 x i32> %tmp5
@@ -685,7 +685,7 @@
 ;CHECK: saba.4s
         %tmp1 = load <4 x i32>* %A
         %tmp2 = load <4 x i32>* %B
-        %tmp3 = call <4 x i32> @llvm.arm64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+        %tmp3 = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
         %tmp4 = load <4 x i32>* %C
         %tmp5 = add <4 x i32> %tmp3, %tmp4
         ret <4 x i32> %tmp5
@@ -696,7 +696,7 @@
 ;CHECK: uaba.8b
         %tmp1 = load <8 x i8>* %A
         %tmp2 = load <8 x i8>* %B
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
         %tmp4 = load <8 x i8>* %C
         %tmp5 = add <8 x i8> %tmp3, %tmp4
         ret <8 x i8> %tmp5
@@ -707,7 +707,7 @@
 ;CHECK: uaba.16b
         %tmp1 = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
-        %tmp3 = call <16 x i8> @llvm.arm64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+        %tmp3 = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
         %tmp4 = load <16 x i8>* %C
         %tmp5 = add <16 x i8> %tmp3, %tmp4
         ret <16 x i8> %tmp5
@@ -718,7 +718,7 @@
 ;CHECK: uaba.4h
         %tmp1 = load <4 x i16>* %A
         %tmp2 = load <4 x i16>* %B
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
         %tmp4 = load <4 x i16>* %C
         %tmp5 = add <4 x i16> %tmp3, %tmp4
         ret <4 x i16> %tmp5
@@ -729,7 +729,7 @@
 ;CHECK: uaba.8h
         %tmp1 = load <8 x i16>* %A
         %tmp2 = load <8 x i16>* %B
-        %tmp3 = call <8 x i16> @llvm.arm64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+        %tmp3 = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
         %tmp4 = load <8 x i16>* %C
         %tmp5 = add <8 x i16> %tmp3, %tmp4
         ret <8 x i16> %tmp5
@@ -740,7 +740,7 @@
 ;CHECK: uaba.2s
         %tmp1 = load <2 x i32>* %A
         %tmp2 = load <2 x i32>* %B
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
         %tmp4 = load <2 x i32>* %C
         %tmp5 = add <2 x i32> %tmp3, %tmp4
         ret <2 x i32> %tmp5
@@ -751,7 +751,7 @@
 ;CHECK: uaba.4s
         %tmp1 = load <4 x i32>* %A
         %tmp2 = load <4 x i32>* %B
-        %tmp3 = call <4 x i32> @llvm.arm64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+        %tmp3 = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
         %tmp4 = load <4 x i32>* %C
         %tmp5 = add <4 x i32> %tmp3, %tmp4
         ret <4 x i32> %tmp5
@@ -761,19 +761,19 @@
 define float @fabds(float %a, float %b) nounwind {
 ; CHECK-LABEL: fabds:
 ; CHECK: fabd s0, s0, s1
-  %vabd.i = tail call float @llvm.arm64.sisd.fabd.f32(float %a, float %b) nounwind
+  %vabd.i = tail call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) nounwind
   ret float %vabd.i
 }
 
 define double @fabdd(double %a, double %b) nounwind {
 ; CHECK-LABEL: fabdd:
 ; CHECK: fabd d0, d0, d1
-  %vabd.i = tail call double @llvm.arm64.sisd.fabd.f64(double %a, double %b) nounwind
+  %vabd.i = tail call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) nounwind
   ret double %vabd.i
 }
 
-declare double @llvm.arm64.sisd.fabd.f64(double, double) nounwind readnone
-declare float @llvm.arm64.sisd.fabd.f32(float, float) nounwind readnone
+declare double @llvm.aarch64.sisd.fabd.f64(double, double) nounwind readnone
+declare float @llvm.aarch64.sisd.fabd.f32(float, float) nounwind readnone
 
 define <2 x i64> @uabdl_from_extract_dup(<4 x i32> %lhs, i32 %rhs) {
 ; CHECK-LABEL: uabdl_from_extract_dup:
@@ -784,7 +784,7 @@
 
   %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
 
-  %res = tail call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
+  %res = tail call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
   %res1 = zext <2 x i32> %res to <2 x i64>
   ret <2 x i64> %res1
 }
@@ -798,7 +798,7 @@
 
   %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
 
-  %res = tail call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
+  %res = tail call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
   %res1 = zext <2 x i32> %res to <2 x i64>
   ret <2 x i64> %res1
 }
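
Note (reviewer annotation; the sketch below is illustrative and the @sabal_shape
name is hypothetical, not from the tree): every sabal/uabal test in this file
exercises the same fold — an [su]abd intrinsic whose zero-extended result feeds
an add is expected to select as a single accumulating sabal/uabal instruction:

define <8 x i16> @sabal_shape(<8 x i8> %a, <8 x i8> %b, <8 x i16> %acc) nounwind {
  ; sabd + zext + add should become one sabal.8h, not separate widen/add steps
  %d = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b)
  %d.wide = zext <8 x i8> %d to <8 x i16>
  %acc2 = add <8 x i16> %acc, %d.wide
  ret <8 x i16> %acc2
}
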
diff --git a/llvm/test/CodeGen/ARM64/vadd.ll b/llvm/test/CodeGen/AArch64/arm64-vadd.ll
similarity index 80%
rename from llvm/test/CodeGen/ARM64/vadd.ll
rename to llvm/test/CodeGen/AArch64/arm64-vadd.ll
index f674c6d..9ed8aa6 100644
--- a/llvm/test/CodeGen/ARM64/vadd.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vadd.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
 
 define <8 x i8> @addhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: addhn8b:
 ;CHECK: addhn.8b
         %tmp1 = load <8 x i16>* %A
         %tmp2 = load <8 x i16>* %B
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.addhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
         ret <8 x i8> %tmp3
 }
 
@@ -14,7 +14,7 @@
 ;CHECK: addhn.4h
         %tmp1 = load <4 x i32>* %A
         %tmp2 = load <4 x i32>* %B
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
         ret <4 x i16> %tmp3
 }
 
@@ -23,7 +23,7 @@
 ;CHECK: addhn.2s
         %tmp1 = load <2 x i64>* %A
         %tmp2 = load <2 x i64>* %B
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.addhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
         ret <2 x i32> %tmp3
 }
 
@@ -31,8 +31,8 @@
 ;CHECK-LABEL: addhn2_16b:
 ;CHECK: addhn.8b
 ;CHECK-NEXT: addhn2.16b
-  %vaddhn2.i = tail call <8 x i8> @llvm.arm64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
-  %vaddhn_high2.i = tail call <8 x i8> @llvm.arm64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+  %vaddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+  %vaddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
   %res = shufflevector <8 x i8> %vaddhn2.i, <8 x i8> %vaddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <16 x i8> %res
 }
@@ -41,8 +41,8 @@
 ;CHECK-LABEL: addhn2_8h:
 ;CHECK: addhn.4h
 ;CHECK-NEXT: addhn2.8h
-  %vaddhn2.i = tail call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
-  %vaddhn_high3.i = tail call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+  %vaddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+  %vaddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
   %res = shufflevector <4 x i16> %vaddhn2.i, <4 x i16> %vaddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i16> %res
 }
@@ -51,15 +51,15 @@
 ;CHECK-LABEL: addhn2_4s:
 ;CHECK: addhn.2s
 ;CHECK-NEXT: addhn2.4s
-  %vaddhn2.i = tail call <2 x i32> @llvm.arm64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
-  %vaddhn_high3.i = tail call <2 x i32> @llvm.arm64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+  %vaddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+  %vaddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
   %res = shufflevector <2 x i32> %vaddhn2.i, <2 x i32> %vaddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i32> %res
 }
 
-declare <2 x i32> @llvm.arm64.neon.addhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.addhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
 
 
 define <8 x i8> @raddhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
@@ -67,7 +67,7 @@
 ;CHECK: raddhn.8b
         %tmp1 = load <8 x i16>* %A
         %tmp2 = load <8 x i16>* %B
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
         ret <8 x i8> %tmp3
 }
 
@@ -76,7 +76,7 @@
 ;CHECK: raddhn.4h
         %tmp1 = load <4 x i32>* %A
         %tmp2 = load <4 x i32>* %B
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
         ret <4 x i16> %tmp3
 }
 
@@ -85,7 +85,7 @@
 ;CHECK: raddhn.2s
         %tmp1 = load <2 x i64>* %A
         %tmp2 = load <2 x i64>* %B
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
         ret <2 x i32> %tmp3
 }
 
@@ -93,8 +93,8 @@
 ;CHECK-LABEL: raddhn2_16b:
 ;CHECK: raddhn.8b
 ;CHECK-NEXT: raddhn2.16b
-  %vraddhn2.i = tail call <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
-  %vraddhn_high2.i = tail call <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+  %vraddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+  %vraddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
   %res = shufflevector <8 x i8> %vraddhn2.i, <8 x i8> %vraddhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <16 x i8> %res
 }
@@ -103,8 +103,8 @@
 ;CHECK-LABEL: raddhn2_8h:
 ;CHECK: raddhn.4h
 ;CHECK-NEXT: raddhn2.8h
-  %vraddhn2.i = tail call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
-  %vraddhn_high3.i = tail call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+  %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+  %vraddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
   %res = shufflevector <4 x i16> %vraddhn2.i, <4 x i16> %vraddhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i16> %res
 }
@@ -113,15 +113,15 @@
 ;CHECK-LABEL: raddhn2_4s:
 ;CHECK: raddhn.2s
 ;CHECK-NEXT: raddhn2.4s
-  %vraddhn2.i = tail call <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
-  %vraddhn_high3.i = tail call <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+  %vraddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+  %vraddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
   %res = shufflevector <2 x i32> %vraddhn2.i, <2 x i32> %vraddhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i32> %res
 }
 
-declare <2 x i32> @llvm.arm64.neon.raddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <8 x i16> @saddl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ;CHECK-LABEL: saddl8h:
@@ -428,7 +428,7 @@
 ;CHECK-LABEL: saddlp4h:
 ;CHECK: saddlp.4h
         %tmp1 = load <8 x i8>* %A
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
         ret <4 x i16> %tmp3
 }
 
@@ -436,7 +436,7 @@
 ;CHECK-LABEL: saddlp2s:
 ;CHECK: saddlp.2s
         %tmp1 = load <4 x i16>* %A
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
         ret <2 x i32> %tmp3
 }
 
@@ -444,7 +444,7 @@
 ;CHECK-LABEL: saddlp1d:
 ;CHECK: saddlp.1d
         %tmp1 = load <2 x i32>* %A
-        %tmp3 = call <1 x i64> @llvm.arm64.neon.saddlp.v1i64.v2i32(<2 x i32> %tmp1)
+        %tmp3 = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> %tmp1)
         ret <1 x i64> %tmp3
 }
 
@@ -452,7 +452,7 @@
 ;CHECK-LABEL: saddlp8h:
 ;CHECK: saddlp.8h
         %tmp1 = load <16 x i8>* %A
-        %tmp3 = call <8 x i16> @llvm.arm64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
+        %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
         ret <8 x i16> %tmp3
 }
 
@@ -460,7 +460,7 @@
 ;CHECK-LABEL: saddlp4s:
 ;CHECK: saddlp.4s
         %tmp1 = load <8 x i16>* %A
-        %tmp3 = call <4 x i32> @llvm.arm64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
+        %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
         ret <4 x i32> %tmp3
 }
 
@@ -468,23 +468,23 @@
 ;CHECK-LABEL: saddlp2d:
 ;CHECK: saddlp.2d
         %tmp1 = load <4 x i32>* %A
-        %tmp3 = call <2 x i64> @llvm.arm64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
+        %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
         ret <2 x i64> %tmp3
 }
 
-declare <4 x i16>  @llvm.arm64.neon.saddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.saddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.saddlp.v1i64.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i16>  @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32>) nounwind readnone
 
-declare <8 x i16>  @llvm.arm64.neon.saddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.saddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.saddlp.v2i64.v4i32(<4 x i32>) nounwind readnone
+declare <8 x i16>  @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32>) nounwind readnone
 
 define <4 x i16> @uaddlp4h(<8 x i8>* %A) nounwind {
 ;CHECK-LABEL: uaddlp4h:
 ;CHECK: uaddlp.4h
         %tmp1 = load <8 x i8>* %A
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
         ret <4 x i16> %tmp3
 }
 
@@ -492,7 +492,7 @@
 ;CHECK-LABEL: uaddlp2s:
 ;CHECK: uaddlp.2s
         %tmp1 = load <4 x i16>* %A
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
         ret <2 x i32> %tmp3
 }
 
@@ -500,7 +500,7 @@
 ;CHECK-LABEL: uaddlp1d:
 ;CHECK: uaddlp.1d
         %tmp1 = load <2 x i32>* %A
-        %tmp3 = call <1 x i64> @llvm.arm64.neon.uaddlp.v1i64.v2i32(<2 x i32> %tmp1)
+        %tmp3 = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> %tmp1)
         ret <1 x i64> %tmp3
 }
 
@@ -508,7 +508,7 @@
 ;CHECK-LABEL: uaddlp8h:
 ;CHECK: uaddlp.8h
         %tmp1 = load <16 x i8>* %A
-        %tmp3 = call <8 x i16> @llvm.arm64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
+        %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
         ret <8 x i16> %tmp3
 }
 
@@ -516,7 +516,7 @@
 ;CHECK-LABEL: uaddlp4s:
 ;CHECK: uaddlp.4s
         %tmp1 = load <8 x i16>* %A
-        %tmp3 = call <4 x i32> @llvm.arm64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
+        %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
         ret <4 x i32> %tmp3
 }
 
@@ -524,23 +524,23 @@
 ;CHECK-LABEL: uaddlp2d:
 ;CHECK: uaddlp.2d
         %tmp1 = load <4 x i32>* %A
-        %tmp3 = call <2 x i64> @llvm.arm64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
+        %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
         ret <2 x i64> %tmp3
 }
 
-declare <4 x i16>  @llvm.arm64.neon.uaddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.uaddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.uaddlp.v1i64.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i16>  @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32>) nounwind readnone
 
-declare <8 x i16>  @llvm.arm64.neon.uaddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.uaddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.uaddlp.v2i64.v4i32(<4 x i32>) nounwind readnone
+declare <8 x i16>  @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32>) nounwind readnone
 
 define <4 x i16> @sadalp4h(<8 x i8>* %A, <4 x i16>* %B) nounwind {
 ;CHECK-LABEL: sadalp4h:
 ;CHECK: sadalp.4h
         %tmp1 = load <8 x i8>* %A
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1)
         %tmp4 = load <4 x i16>* %B
         %tmp5 = add <4 x i16> %tmp3, %tmp4
         ret <4 x i16> %tmp5
@@ -550,7 +550,7 @@
 ;CHECK-LABEL: sadalp2s:
 ;CHECK: sadalp.2s
         %tmp1 = load <4 x i16>* %A
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1)
         %tmp4 = load <2 x i32>* %B
         %tmp5 = add <2 x i32> %tmp3, %tmp4
         ret <2 x i32> %tmp5
@@ -560,7 +560,7 @@
 ;CHECK-LABEL: sadalp8h:
 ;CHECK: sadalp.8h
         %tmp1 = load <16 x i8>* %A
-        %tmp3 = call <8 x i16> @llvm.arm64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
+        %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1)
         %tmp4 = load <8 x i16>* %B
         %tmp5 = add <8 x i16> %tmp3, %tmp4
         ret <8 x i16> %tmp5
@@ -570,7 +570,7 @@
 ;CHECK-LABEL: sadalp4s:
 ;CHECK: sadalp.4s
         %tmp1 = load <8 x i16>* %A
-        %tmp3 = call <4 x i32> @llvm.arm64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
+        %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1)
         %tmp4 = load <4 x i32>* %B
         %tmp5 = add <4 x i32> %tmp3, %tmp4
         ret <4 x i32> %tmp5
@@ -580,7 +580,7 @@
 ;CHECK-LABEL: sadalp2d:
 ;CHECK: sadalp.2d
         %tmp1 = load <4 x i32>* %A
-        %tmp3 = call <2 x i64> @llvm.arm64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
+        %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1)
         %tmp4 = load <2 x i64>* %B
         %tmp5 = add <2 x i64> %tmp3, %tmp4
         ret <2 x i64> %tmp5
@@ -590,7 +590,7 @@
 ;CHECK-LABEL: uadalp4h:
 ;CHECK: uadalp.4h
         %tmp1 = load <8 x i8>* %A
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1)
         %tmp4 = load <4 x i16>* %B
         %tmp5 = add <4 x i16> %tmp3, %tmp4
         ret <4 x i16> %tmp5
@@ -600,7 +600,7 @@
 ;CHECK-LABEL: uadalp2s:
 ;CHECK: uadalp.2s
         %tmp1 = load <4 x i16>* %A
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1)
         %tmp4 = load <2 x i32>* %B
         %tmp5 = add <2 x i32> %tmp3, %tmp4
         ret <2 x i32> %tmp5
@@ -610,7 +610,7 @@
 ;CHECK-LABEL: uadalp8h:
 ;CHECK: uadalp.8h
         %tmp1 = load <16 x i8>* %A
-        %tmp3 = call <8 x i16> @llvm.arm64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
+        %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1)
         %tmp4 = load <8 x i16>* %B
         %tmp5 = add <8 x i16> %tmp3, %tmp4
         ret <8 x i16> %tmp5
@@ -620,7 +620,7 @@
 ;CHECK-LABEL: uadalp4s:
 ;CHECK: uadalp.4s
         %tmp1 = load <8 x i16>* %A
-        %tmp3 = call <4 x i32> @llvm.arm64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
+        %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1)
         %tmp4 = load <4 x i32>* %B
         %tmp5 = add <4 x i32> %tmp3, %tmp4
         ret <4 x i32> %tmp5
@@ -630,7 +630,7 @@
 ;CHECK-LABEL: uadalp2d:
 ;CHECK: uadalp.2d
         %tmp1 = load <4 x i32>* %A
-        %tmp3 = call <2 x i64> @llvm.arm64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
+        %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1)
         %tmp4 = load <2 x i64>* %B
         %tmp5 = add <2 x i64> %tmp3, %tmp4
         ret <2 x i64> %tmp5
@@ -641,7 +641,7 @@
 ;CHECK: addp.8b
         %tmp1 = load <8 x i8>* %A
         %tmp2 = load <8 x i8>* %B
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.addp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
         ret <8 x i8> %tmp3
 }
 
@@ -650,7 +650,7 @@
 ;CHECK: addp.16b
         %tmp1 = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
-        %tmp3 = call <16 x i8> @llvm.arm64.neon.addp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+        %tmp3 = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
         ret <16 x i8> %tmp3
 }
 
@@ -659,7 +659,7 @@
 ;CHECK: addp.4h
         %tmp1 = load <4 x i16>* %A
         %tmp2 = load <4 x i16>* %B
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.addp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
         ret <4 x i16> %tmp3
 }
 
@@ -668,7 +668,7 @@
 ;CHECK: addp.8h
         %tmp1 = load <8 x i16>* %A
         %tmp2 = load <8 x i16>* %B
-        %tmp3 = call <8 x i16> @llvm.arm64.neon.addp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+        %tmp3 = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
         ret <8 x i16> %tmp3
 }
 
@@ -677,7 +677,7 @@
 ;CHECK: addp.2s
         %tmp1 = load <2 x i32>* %A
         %tmp2 = load <2 x i32>* %B
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.addp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
         ret <2 x i32> %tmp3
 }
 
@@ -686,7 +686,7 @@
 ;CHECK: addp.4s
         %tmp1 = load <4 x i32>* %A
         %tmp2 = load <4 x i32>* %B
-        %tmp3 = call <4 x i32> @llvm.arm64.neon.addp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+        %tmp3 = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
         ret <4 x i32> %tmp3
 }
 
@@ -695,24 +695,24 @@
 ;CHECK: addp.2d
         %tmp1 = load <2 x i64>* %A
         %tmp2 = load <2 x i64>* %B
-        %tmp3 = call <2 x i64> @llvm.arm64.neon.addp.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+        %tmp3 = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
         ret <2 x i64> %tmp3
 }
 
-declare <8 x i8> @llvm.arm64.neon.addp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.addp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.addp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.addp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.addp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.addp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.addp.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
 
 define <2 x float> @faddp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
 ;CHECK-LABEL: faddp_2s:
 ;CHECK: faddp.2s
         %tmp1 = load <2 x float>* %A
         %tmp2 = load <2 x float>* %B
-        %tmp3 = call <2 x float> @llvm.arm64.neon.addp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+        %tmp3 = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
         ret <2 x float> %tmp3
 }
 
@@ -721,7 +721,7 @@
 ;CHECK: faddp.4s
         %tmp1 = load <4 x float>* %A
         %tmp2 = load <4 x float>* %B
-        %tmp3 = call <4 x float> @llvm.arm64.neon.addp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+        %tmp3 = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
         ret <4 x float> %tmp3
 }
 
@@ -730,13 +730,13 @@
 ;CHECK: faddp.2d
         %tmp1 = load <2 x double>* %A
         %tmp2 = load <2 x double>* %B
-        %tmp3 = call <2 x double> @llvm.arm64.neon.addp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+        %tmp3 = call <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
         ret <2 x double> %tmp3
 }
 
-declare <2 x float> @llvm.arm64.neon.addp.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.addp.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.addp.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double>, <2 x double>) nounwind readnone
 
 define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
 ; CHECK-LABEL: uaddl2_duprhs
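
Note (reviewer annotation; @addhn8b_equiv is a hypothetical name and the
function is a gloss, not a test from the tree): addhn returns the high half of
the per-lane sum, and raddhn first adds a rounding bias of half an output lane
(128 for the v8i8 case). In plain IR the addhn.v8i8 result is roughly:

define <8 x i8> @addhn8b_equiv(<8 x i16> %a, <8 x i16> %b) nounwind {
  ; (a + b) >> 8, truncated to the narrow type; raddhn would add 128 first
  %sum = add <8 x i16> %a, %b
  %hi = lshr <8 x i16> %sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %narrow = trunc <8 x i16> %hi to <8 x i8>
  ret <8 x i8> %narrow
}
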
diff --git a/llvm/test/CodeGen/AArch64/arm64-vaddlv.ll b/llvm/test/CodeGen/AArch64/arm64-vaddlv.ll
new file mode 100644
index 0000000..2d64138
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-vaddlv.ll
@@ -0,0 +1,26 @@
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s
+
+define i64 @test_vaddlv_s32(<2 x i32> %a1) nounwind readnone {
+; CHECK: test_vaddlv_s32
+; CHECK: saddlp.1d v[[REGNUM:[0-9]+]], v[[INREG:[0-9]+]]
+; CHECK-NEXT: fmov x[[OUTREG:[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+  %vaddlv.i = tail call i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32> %a1) nounwind
+  ret i64 %vaddlv.i
+}
+
+define i64 @test_vaddlv_u32(<2 x i32> %a1) nounwind readnone {
+; CHECK: test_vaddlv_u32
+; CHECK: uaddlp.1d v[[REGNUM:[0-9]+]], v[[INREG:[0-9]+]]
+; CHECK-NEXT: fmov x[[OUTREG:[0-9]+]], d[[REGNUM]]
+; CHECK-NEXT: ret
+entry:
+  %vaddlv.i = tail call i64 @llvm.aarch64.neon.uaddlv.i64.v2i32(<2 x i32> %a1) nounwind
+  ret i64 %vaddlv.i
+}
+
+declare i64 @llvm.aarch64.neon.uaddlv.i64.v2i32(<2 x i32>) nounwind readnone
+
+declare i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32>) nounwind readnone
+
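
Note (reviewer annotation, hedged from memory of the ISA): the CHECK lines above
expect saddlp.1d/uaddlp.1d plus an fmov rather than an across-lanes reduction
because, as far as I can tell, SADDLV/UADDLV only have 8b/16b/4h/8h/4s forms —
there is no 2s form, so the two-lane case is a single pairwise add-long. What
the intrinsic computes, as a gloss (@vaddlv_s32_equiv is a hypothetical name):

define i64 @vaddlv_s32_equiv(<2 x i32> %v) nounwind {
  ; sign-extend both lanes to i64 and add them
  %e0 = extractelement <2 x i32> %v, i32 0
  %e1 = extractelement <2 x i32> %v, i32 1
  %w0 = sext i32 %e0 to i64
  %w1 = sext i32 %e1 to i64
  %sum = add i64 %w0, %w1
  ret i64 %sum
}
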
diff --git a/llvm/test/CodeGen/ARM64/vaddv.ll b/llvm/test/CodeGen/AArch64/arm64-vaddv.ll
similarity index 63%
rename from llvm/test/CodeGen/ARM64/vaddv.ll
rename to llvm/test/CodeGen/AArch64/arm64-vaddv.ll
index 154f917..2d92ce6 100644
--- a/llvm/test/CodeGen/ARM64/vaddv.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vaddv.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s -mcpu=cyclone | FileCheck %s
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s -mcpu=cyclone | FileCheck %s
 
 define signext i8 @test_vaddv_s8(<8 x i8> %a1) {
 ; CHECK-LABEL: test_vaddv_s8:
@@ -6,7 +6,7 @@
 ; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
 ; CHECK-NEXT: ret
 entry:
-  %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v8i8(<8 x i8> %a1)
+  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> %a1)
   %0 = trunc i32 %vaddv.i to i8
   ret i8 %0
 }
@@ -17,7 +17,7 @@
 ; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
 ; CHECK-NEXT: ret
 entry:
-  %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v4i16(<4 x i16> %a1)
+  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> %a1)
   %0 = trunc i32 %vaddv.i to i16
   ret i16 %0
 }
@@ -29,7 +29,7 @@
 ; CHECK-NEXT: fmov w0, s[[REGNUM]]
 ; CHECK-NEXT: ret
 entry:
-  %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v2i32(<2 x i32> %a1)
+  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> %a1)
   ret i32 %vaddv.i
 }
 
@@ -39,7 +39,7 @@
 ; CHECK-NEXT: fmov x0, [[REGNUM]]
 ; CHECK-NEXT: ret
 entry:
-  %vaddv.i = tail call i64 @llvm.arm64.neon.saddv.i64.v2i64(<2 x i64> %a1)
+  %vaddv.i = tail call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> %a1)
   ret i64 %vaddv.i
 }
 
@@ -49,7 +49,7 @@
 ; CHECK-NEXT: fmov w0, s[[REGNUM]]
 ; CHECK-NEXT: ret
 entry:
-  %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v8i8(<8 x i8> %a1)
+  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a1)
   %0 = trunc i32 %vaddv.i to i8
   ret i8 %0
 }
@@ -60,7 +60,7 @@
 ; CHECK-NEXT: fmov w0, s[[REGNUM]]
 ; CHECK-NEXT: ret
 entry:
-  %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v8i8(<8 x i8> %a1)
+  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a1)
   %0 = and i32 %vaddv.i, 511 ; 0x1ff
   ret i32 %0
 }
@@ -71,7 +71,7 @@
 ; CHECK-NEXT: fmov w0, s[[REGNUM]]
 ; CHECK-NEXT: ret
 entry:
-  %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v4i16(<4 x i16> %a1)
+  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> %a1)
   %0 = trunc i32 %vaddv.i to i16
   ret i16 %0
 }
@@ -82,7 +82,7 @@
 ; CHECK-NEXT: fmov w0, s[[REGNUM]]
 ; CHECK-NEXT: ret
 entry:
-  %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v4i16(<4 x i16> %a1)
+  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> %a1)
   %0 = and i32 %vaddv.i, 3276799 ; 0x31ffff
   ret i32 %0
 }
@@ -94,7 +94,7 @@
 ; CHECK-NEXT: fmov w0, s[[REGNUM]]
 ; CHECK-NEXT: ret
 entry:
-  %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v2i32(<2 x i32> %a1)
+  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> %a1)
   ret i32 %vaddv.i
 }
 
@@ -103,7 +103,7 @@
 ; CHECK: faddp.2s s0, v0
 ; CHECK-NEXT: ret
 entry:
-  %vaddv.i = tail call float @llvm.arm64.neon.faddv.f32.v2f32(<2 x float> %a1)
+  %vaddv.i = tail call float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> %a1)
   ret float %vaddv.i
 }
 
@@ -113,7 +113,7 @@
 ; CHECK: faddp.2s s0, [[REGNUM]]
 ; CHECK-NEXT: ret
 entry:
-  %vaddv.i = tail call float @llvm.arm64.neon.faddv.f32.v4f32(<4 x float> %a1)
+  %vaddv.i = tail call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %a1)
   ret float %vaddv.i
 }
 
@@ -122,7 +122,7 @@
 ; CHECK: faddp.2d d0, v0
 ; CHECK-NEXT: ret
 entry:
-  %vaddv.i = tail call double @llvm.arm64.neon.faddv.f64.v2f64(<2 x double> %a1)
+  %vaddv.i = tail call double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> %a1)
   ret double %vaddv.i
 }
 
@@ -132,7 +132,7 @@
 ; CHECK-NEXT: fmov x0, [[REGNUM]]
 ; CHECK-NEXT: ret
 entry:
-  %vaddv.i = tail call i64 @llvm.arm64.neon.uaddv.i64.v2i64(<2 x i64> %a1)
+  %vaddv.i = tail call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a1)
   ret i64 %vaddv.i
 }
 
@@ -143,7 +143,7 @@
 ; CHECK-NOT: ins
 ; CHECK: ret
 entry:
-  %vaddv.i = tail call i64 @llvm.arm64.neon.uaddv.i64.v2i64(<2 x i64> %a1)
+  %vaddv.i = tail call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a1)
   %vec = insertelement <1 x i64> undef, i64 %vaddv.i, i32 0
   ret <1 x i64> %vec
 }
@@ -154,7 +154,7 @@
 ; CHECK-NEXT: smov.b w0, v[[REGNUM]][0]
 ; CHECK-NEXT: ret
 entry:
-  %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v16i8(<16 x i8> %a1)
+  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> %a1)
   %0 = trunc i32 %vaddv.i to i8
   ret i8 %0
 }
@@ -165,7 +165,7 @@
 ; CHECK-NEXT: smov.h w0, v[[REGNUM]][0]
 ; CHECK-NEXT: ret
 entry:
-  %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v8i16(<8 x i16> %a1)
+  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> %a1)
   %0 = trunc i32 %vaddv.i to i16
   ret i16 %0
 }
@@ -176,7 +176,7 @@
 ; CHECK-NEXT: fmov w0, [[REGNUM]]
 ; CHECK-NEXT: ret
 entry:
-  %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v4i32(<4 x i32> %a1)
+  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> %a1)
   ret i32 %vaddv.i
 }
 
@@ -186,7 +186,7 @@
 ; CHECK-NEXT: fmov w0, s[[REGNUM]]
 ; CHECK-NEXT: ret
 entry:
-  %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v16i8(<16 x i8> %a1)
+  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> %a1)
   %0 = trunc i32 %vaddv.i to i8
   ret i8 %0
 }
@@ -197,7 +197,7 @@
 ; CHECK-NEXT: fmov w0, s[[REGNUM]]
 ; CHECK-NEXT: ret
 entry:
-  %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v8i16(<8 x i16> %a1)
+  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> %a1)
   %0 = trunc i32 %vaddv.i to i16
   ret i16 %0
 }
@@ -208,38 +208,38 @@
 ; CHECK-NEXT: fmov [[FMOVRES:w[0-9]+]], [[REGNUM]]
 ; CHECK-NEXT: ret
 entry:
-  %vaddv.i = tail call i32 @llvm.arm64.neon.uaddv.i32.v4i32(<4 x i32> %a1)
+  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> %a1)
   ret i32 %vaddv.i
 }
 
-declare i32 @llvm.arm64.neon.uaddv.i32.v4i32(<4 x i32>)
+declare i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32>)
 
-declare i32 @llvm.arm64.neon.uaddv.i32.v8i16(<8 x i16>)
+declare i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16>)
 
-declare i32 @llvm.arm64.neon.uaddv.i32.v16i8(<16 x i8>)
+declare i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8>)
 
-declare i32 @llvm.arm64.neon.saddv.i32.v4i32(<4 x i32>)
+declare i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32>)
 
-declare i32 @llvm.arm64.neon.saddv.i32.v8i16(<8 x i16>)
+declare i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16>)
 
-declare i32 @llvm.arm64.neon.saddv.i32.v16i8(<16 x i8>)
+declare i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8>)
 
-declare i64 @llvm.arm64.neon.uaddv.i64.v2i64(<2 x i64>)
+declare i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64>)
 
-declare i32 @llvm.arm64.neon.uaddv.i32.v2i32(<2 x i32>)
+declare i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32>)
 
-declare i32 @llvm.arm64.neon.uaddv.i32.v4i16(<4 x i16>)
+declare i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16>)
 
-declare i32 @llvm.arm64.neon.uaddv.i32.v8i8(<8 x i8>)
+declare i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8>)
 
-declare i32 @llvm.arm64.neon.saddv.i32.v2i32(<2 x i32>)
+declare i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32>)
 
-declare i64 @llvm.arm64.neon.saddv.i64.v2i64(<2 x i64>)
+declare i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64>)
 
-declare i32 @llvm.arm64.neon.saddv.i32.v4i16(<4 x i16>)
+declare i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16>)
 
-declare i32 @llvm.arm64.neon.saddv.i32.v8i8(<8 x i8>)
+declare i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8>)
 
-declare float @llvm.arm64.neon.faddv.f32.v2f32(<2 x float> %a1)
-declare float @llvm.arm64.neon.faddv.f32.v4f32(<4 x float> %a1)
-declare double @llvm.arm64.neon.faddv.f64.v2f64(<2 x double> %a1)
+declare float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> %a1)
+declare float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %a1)
+declare double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> %a1)
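
Note (reviewer annotation; @faddv_2s_equiv is a hypothetical gloss, not a test
from the tree): the floating-point reductions above check faddp rather than any
faddv instruction — base AArch64 has no across-lanes FP add, so the faddv
intrinsics lower to pairwise faddp steps. The 2s form is just lane0 + lane1:

define float @faddv_2s_equiv(<2 x float> %v) nounwind {
  ; one faddp.2s covers the whole two-lane reduction
  %e0 = extractelement <2 x float> %v, i32 0
  %e1 = extractelement <2 x float> %v, i32 1
  %sum = fadd float %e0, %e1
  ret float %sum
}
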
diff --git a/llvm/test/CodeGen/ARM64/variadic-aapcs.ll b/llvm/test/CodeGen/AArch64/arm64-variadic-aapcs.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/variadic-aapcs.ll
rename to llvm/test/CodeGen/AArch64/arm64-variadic-aapcs.ll
diff --git a/llvm/test/CodeGen/ARM64/vbitwise.ll b/llvm/test/CodeGen/AArch64/arm64-vbitwise.ll
similarity index 86%
rename from llvm/test/CodeGen/ARM64/vbitwise.ll
rename to llvm/test/CodeGen/AArch64/arm64-vbitwise.ll
index 7d8378d..93de95e 100644
--- a/llvm/test/CodeGen/ARM64/vbitwise.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vbitwise.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 define <8 x i8> @rbit_8b(<8 x i8>* %A) nounwind {
 ;CHECK-LABEL: rbit_8b:
 ;CHECK: rbit.8b
 	%tmp1 = load <8 x i8>* %A
-	%tmp3 = call <8 x i8> @llvm.arm64.neon.rbit.v8i8(<8 x i8> %tmp1)
+	%tmp3 = call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %tmp1)
 	ret <8 x i8> %tmp3
 }
 
@@ -12,12 +12,12 @@
 ;CHECK-LABEL: rbit_16b:
 ;CHECK: rbit.16b
 	%tmp1 = load <16 x i8>* %A
-	%tmp3 = call <16 x i8> @llvm.arm64.neon.rbit.v16i8(<16 x i8> %tmp1)
+	%tmp3 = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %tmp1)
 	ret <16 x i8> %tmp3
 }
 
-declare <8 x i8> @llvm.arm64.neon.rbit.v8i8(<8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.rbit.v16i8(<16 x i8>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8>) nounwind readnone
 
 define <8 x i16> @sxtl8h(<8 x i8>* %A) nounwind {
 ;CHECK-LABEL: sxtl8h:
diff --git a/llvm/test/CodeGen/ARM64/vclz.ll b/llvm/test/CodeGen/AArch64/arm64-vclz.ll
similarity index 97%
rename from llvm/test/CodeGen/ARM64/vclz.ll
rename to llvm/test/CodeGen/AArch64/arm64-vclz.ll
index ddc09ed..cf5670a0 100644
--- a/llvm/test/CodeGen/ARM64/vclz.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vclz.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s
 
 define <8 x i8> @test_vclz_u8(<8 x i8> %a) nounwind readnone ssp {
   ; CHECK-LABEL: test_vclz_u8:
diff --git a/llvm/test/CodeGen/ARM64/vcmp.ll b/llvm/test/CodeGen/AArch64/arm64-vcmp.ll
similarity index 76%
rename from llvm/test/CodeGen/ARM64/vcmp.ll
rename to llvm/test/CodeGen/AArch64/arm64-vcmp.ll
index 56153f0..982ab09 100644
--- a/llvm/test/CodeGen/ARM64/vcmp.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vcmp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 
 define void @fcmltz_4s(<4 x float> %a, <4 x i16>* %p) nounwind {
@@ -18,7 +18,7 @@
 ;CHECK: facge.2s
 	%tmp1 = load <2 x float>* %A
 	%tmp2 = load <2 x float>* %B
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.facge.v2i32.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
 	ret <2 x i32> %tmp3
 }
 
@@ -27,7 +27,7 @@
 ;CHECK: facge.4s
 	%tmp1 = load <4 x float>* %A
 	%tmp2 = load <4 x float>* %B
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.facge.v4i32.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
 	ret <4 x i32> %tmp3
 }
 
@@ -36,20 +36,20 @@
 ;CHECK: facge.2d
 	%tmp1 = load <2 x double>* %A
 	%tmp2 = load <2 x double>* %B
-	%tmp3 = call <2 x i64> @llvm.arm64.neon.facge.v2i64.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+	%tmp3 = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
 	ret <2 x i64> %tmp3
 }
 
-declare <2 x i32> @llvm.arm64.neon.facge.v2i32.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.facge.v4i32.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.facge.v2i64.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double>, <2 x double>) nounwind readnone
 
 define <2 x i32> @facgt_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
 ;CHECK-LABEL: facgt_2s:
 ;CHECK: facgt.2s
 	%tmp1 = load <2 x float>* %A
 	%tmp2 = load <2 x float>* %B
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.facgt.v2i32.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
 	ret <2 x i32> %tmp3
 }
 
@@ -58,7 +58,7 @@
 ;CHECK: facgt.4s
 	%tmp1 = load <4 x float>* %A
 	%tmp2 = load <4 x float>* %B
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.facgt.v4i32.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
 	ret <4 x i32> %tmp3
 }
 
@@ -67,47 +67,47 @@
 ;CHECK: facgt.2d
 	%tmp1 = load <2 x double>* %A
 	%tmp2 = load <2 x double>* %B
-	%tmp3 = call <2 x i64> @llvm.arm64.neon.facgt.v2i64.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+	%tmp3 = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
 	ret <2 x i64> %tmp3
 }
 
-declare <2 x i32> @llvm.arm64.neon.facgt.v2i32.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.facgt.v4i32.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.facgt.v2i64.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double>, <2 x double>) nounwind readnone
 
 define i32 @facge_s(float %A, float %B) nounwind {
 ; CHECK-LABEL: facge_s:
 ; CHECK: facge {{s[0-9]+}}, s0, s1
-  %mask = call i32 @llvm.arm64.neon.facge.i32.f32(float %A, float %B)
+  %mask = call i32 @llvm.aarch64.neon.facge.i32.f32(float %A, float %B)
   ret i32 %mask
 }
 
 define i64 @facge_d(double %A, double %B) nounwind {
 ; CHECK-LABEL: facge_d:
 ; CHECK: facge {{d[0-9]+}}, d0, d1
-  %mask = call i64 @llvm.arm64.neon.facge.i64.f64(double %A, double %B)
+  %mask = call i64 @llvm.aarch64.neon.facge.i64.f64(double %A, double %B)
   ret i64 %mask
 }
 
-declare i64 @llvm.arm64.neon.facge.i64.f64(double, double)
-declare i32 @llvm.arm64.neon.facge.i32.f32(float, float)
+declare i64 @llvm.aarch64.neon.facge.i64.f64(double, double)
+declare i32 @llvm.aarch64.neon.facge.i32.f32(float, float)
 
 define i32 @facgt_s(float %A, float %B) nounwind {
 ; CHECK-LABEL: facgt_s:
 ; CHECK: facgt {{s[0-9]+}}, s0, s1
-  %mask = call i32 @llvm.arm64.neon.facgt.i32.f32(float %A, float %B)
+  %mask = call i32 @llvm.aarch64.neon.facgt.i32.f32(float %A, float %B)
   ret i32 %mask
 }
 
 define i64 @facgt_d(double %A, double %B) nounwind {
 ; CHECK-LABEL: facgt_d:
 ; CHECK: facgt {{d[0-9]+}}, d0, d1
-  %mask = call i64 @llvm.arm64.neon.facgt.i64.f64(double %A, double %B)
+  %mask = call i64 @llvm.aarch64.neon.facgt.i64.f64(double %A, double %B)
   ret i64 %mask
 }
 
-declare i64 @llvm.arm64.neon.facgt.i64.f64(double, double)
-declare i32 @llvm.arm64.neon.facgt.i32.f32(float, float)
+declare i64 @llvm.aarch64.neon.facgt.i64.f64(double, double)
+declare i32 @llvm.aarch64.neon.facgt.i32.f32(float, float)
 
 define <8 x i8> @cmtst_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ;CHECK-LABEL: cmtst_8b:
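
Note (reviewer annotation; @facge_equiv is a hypothetical gloss, not a test from
the tree): facge/facgt are absolute-value compares — each lane is all-ones when
|a| >= |b| (resp. >) and zero otherwise, which is why the scalar forms above
return i32/i64 masks. The i32 facge checked above behaves roughly like:

define i32 @facge_equiv(float %a, float %b) nounwind {
  ; |a| >= |b| -> 0xffffffff, else 0; NaNs compare false, matching facge
  %abs.a = call float @llvm.fabs.f32(float %a)
  %abs.b = call float @llvm.fabs.f32(float %b)
  %cmp = fcmp oge float %abs.a, float %abs.b
  %mask = sext i1 %cmp to i32
  ret i32 %mask
}
declare float @llvm.fabs.f32(float) nounwind readnone
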
diff --git a/llvm/test/CodeGen/AArch64/arm64-vcnt.ll b/llvm/test/CodeGen/AArch64/arm64-vcnt.ll
new file mode 100644
index 0000000..903501e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-vcnt.ll
@@ -0,0 +1,56 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @cls_8b(<8 x i8>* %A) nounwind {
+;CHECK-LABEL: cls_8b:
+;CHECK: cls.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp3 = call <8 x i8> @llvm.aarch64.neon.cls.v8i8(<8 x i8> %tmp1)
+	ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @cls_16b(<16 x i8>* %A) nounwind {
+;CHECK-LABEL: cls_16b:
+;CHECK: cls.16b
+	%tmp1 = load <16 x i8>* %A
+	%tmp3 = call <16 x i8> @llvm.aarch64.neon.cls.v16i8(<16 x i8> %tmp1)
+	ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @cls_4h(<4 x i16>* %A) nounwind {
+;CHECK-LABEL: cls_4h:
+;CHECK: cls.4h
+	%tmp1 = load <4 x i16>* %A
+	%tmp3 = call <4 x i16> @llvm.aarch64.neon.cls.v4i16(<4 x i16> %tmp1)
+	ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @cls_8h(<8 x i16>* %A) nounwind {
+;CHECK-LABEL: cls_8h:
+;CHECK: cls.8h
+	%tmp1 = load <8 x i16>* %A
+	%tmp3 = call <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16> %tmp1)
+	ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @cls_2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: cls_2s:
+;CHECK: cls.2s
+	%tmp1 = load <2 x i32>* %A
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.cls.v2i32(<2 x i32> %tmp1)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @cls_4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: cls_4s:
+;CHECK: cls.4s
+	%tmp1 = load <4 x i32>* %A
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> %tmp1)
+	ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.cls.v8i8(<8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.cls.v16i8(<16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.cls.v4i16(<4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.cls.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32>) nounwind readnone
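
Note (reviewer annotation; @cls_equiv is a hypothetical gloss on what the
instruction computes per lane, not a test from the tree): cls counts the bits
below the top bit that equal the top (sign) bit, so for example cls(0) == 7 at
i8. A classical scalar equivalence: cls(x) == clz(x ^ (x >>arith 1)) - 1.

define i8 @cls_equiv(i8 %x) nounwind {
  ; flip each bit against its left neighbour, then count the leading zeros
  %shifted = ashr i8 %x, 1
  %diff = xor i8 %x, %shifted
  %lz = call i8 @llvm.ctlz.i8(i8 %diff, i1 false)
  %cls = sub i8 %lz, 1
  ret i8 %cls
}
declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
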
diff --git a/llvm/test/CodeGen/ARM64/vcombine.ll b/llvm/test/CodeGen/AArch64/arm64-vcombine.ll
similarity index 90%
rename from llvm/test/CodeGen/ARM64/vcombine.ll
rename to llvm/test/CodeGen/AArch64/arm64-vcombine.ll
index 16f591e..fa12996 100644
--- a/llvm/test/CodeGen/ARM64/vcombine.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vcombine.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 ; LowerCONCAT_VECTORS() was reversing the order of two parts.
 ; rdar://11558157
diff --git a/llvm/test/CodeGen/ARM64/vcvt.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt.ll
similarity index 66%
rename from llvm/test/CodeGen/ARM64/vcvt.ll
rename to llvm/test/CodeGen/AArch64/arm64-vcvt.ll
index 19bb8cb..8c9e4e9 100644
--- a/llvm/test/CodeGen/ARM64/vcvt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vcvt.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 define <2 x i32> @fcvtas_2s(<2 x float> %A) nounwind {
 ;CHECK-LABEL: fcvtas_2s:
 ;CHECK-NOT: ld1
 ;CHECK: fcvtas.2s v0, v0
 ;CHECK-NEXT: ret
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtas.v2i32.v2f32(<2 x float> %A)
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float> %A)
 	ret <2 x i32> %tmp3
 }
 
@@ -14,7 +14,7 @@
 ;CHECK-NOT: ld1
 ;CHECK: fcvtas.4s v0, v0
 ;CHECK-NEXT: ret
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtas.v4i32.v4f32(<4 x float> %A)
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float> %A)
 	ret <4 x i32> %tmp3
 }
 
@@ -23,20 +23,20 @@
 ;CHECK-NOT: ld1
 ;CHECK: fcvtas.2d v0, v0
 ;CHECK-NEXT: ret
-	%tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtas.v2i64.v2f64(<2 x double> %A)
+	%tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double> %A)
 	ret <2 x i64> %tmp3
 }
 
-declare <2 x i32> @llvm.arm64.neon.fcvtas.v2i32.v2f32(<2 x float>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.fcvtas.v4i32.v4f32(<4 x float>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.fcvtas.v2i64.v2f64(<2 x double>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double>) nounwind readnone
 
 define <2 x i32> @fcvtau_2s(<2 x float> %A) nounwind {
 ;CHECK-LABEL: fcvtau_2s:
 ;CHECK-NOT: ld1
 ;CHECK: fcvtau.2s v0, v0
 ;CHECK-NEXT: ret
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtau.v2i32.v2f32(<2 x float> %A)
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float> %A)
 	ret <2 x i32> %tmp3
 }
 
@@ -45,7 +45,7 @@
 ;CHECK-NOT: ld1
 ;CHECK: fcvtau.4s v0, v0
 ;CHECK-NEXT: ret
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtau.v4i32.v4f32(<4 x float> %A)
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float> %A)
 	ret <4 x i32> %tmp3
 }
 
@@ -54,20 +54,20 @@
 ;CHECK-NOT: ld1
 ;CHECK: fcvtau.2d v0, v0
 ;CHECK-NEXT: ret
-	%tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtau.v2i64.v2f64(<2 x double> %A)
+	%tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double> %A)
 	ret <2 x i64> %tmp3
 }
 
-declare <2 x i32> @llvm.arm64.neon.fcvtau.v2i32.v2f32(<2 x float>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.fcvtau.v4i32.v4f32(<4 x float>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.fcvtau.v2i64.v2f64(<2 x double>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double>) nounwind readnone
 
 define <2 x i32> @fcvtms_2s(<2 x float> %A) nounwind {
 ;CHECK-LABEL: fcvtms_2s:
 ;CHECK-NOT: ld1
 ;CHECK: fcvtms.2s v0, v0
 ;CHECK-NEXT: ret
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtms.v2i32.v2f32(<2 x float> %A)
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float> %A)
 	ret <2 x i32> %tmp3
 }
 
@@ -76,7 +76,7 @@
 ;CHECK-NOT: ld1
 ;CHECK: fcvtms.4s v0, v0
 ;CHECK-NEXT: ret
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtms.v4i32.v4f32(<4 x float> %A)
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float> %A)
 	ret <4 x i32> %tmp3
 }
 
@@ -85,20 +85,20 @@
 ;CHECK-NOT: ld1
 ;CHECK: fcvtms.2d v0, v0
 ;CHECK-NEXT: ret
-	%tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtms.v2i64.v2f64(<2 x double> %A)
+	%tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double> %A)
 	ret <2 x i64> %tmp3
 }
 
-declare <2 x i32> @llvm.arm64.neon.fcvtms.v2i32.v2f32(<2 x float>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.fcvtms.v4i32.v4f32(<4 x float>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.fcvtms.v2i64.v2f64(<2 x double>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double>) nounwind readnone
 
 define <2 x i32> @fcvtmu_2s(<2 x float> %A) nounwind {
 ;CHECK-LABEL: fcvtmu_2s:
 ;CHECK-NOT: ld1
 ;CHECK: fcvtmu.2s v0, v0
 ;CHECK-NEXT: ret
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtmu.v2i32.v2f32(<2 x float> %A)
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float> %A)
 	ret <2 x i32> %tmp3
 }
 
@@ -107,7 +107,7 @@
 ;CHECK-NOT: ld1
 ;CHECK: fcvtmu.4s v0, v0
 ;CHECK-NEXT: ret
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtmu.v4i32.v4f32(<4 x float> %A)
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float> %A)
 	ret <4 x i32> %tmp3
 }
 
@@ -116,20 +116,20 @@
 ;CHECK-NOT: ld1
 ;CHECK: fcvtmu.2d v0, v0
 ;CHECK-NEXT: ret
-	%tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtmu.v2i64.v2f64(<2 x double> %A)
+	%tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double> %A)
 	ret <2 x i64> %tmp3
 }
 
-declare <2 x i32> @llvm.arm64.neon.fcvtmu.v2i32.v2f32(<2 x float>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.fcvtmu.v4i32.v4f32(<4 x float>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.fcvtmu.v2i64.v2f64(<2 x double>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double>) nounwind readnone
 
 define <2 x i32> @fcvtps_2s(<2 x float> %A) nounwind {
 ;CHECK-LABEL: fcvtps_2s:
 ;CHECK-NOT: ld1
 ;CHECK: fcvtps.2s v0, v0
 ;CHECK-NEXT: ret
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtps.v2i32.v2f32(<2 x float> %A)
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float> %A)
 	ret <2 x i32> %tmp3
 }
 
@@ -138,7 +138,7 @@
 ;CHECK-NOT: ld1
 ;CHECK: fcvtps.4s v0, v0
 ;CHECK-NEXT: ret
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtps.v4i32.v4f32(<4 x float> %A)
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float> %A)
 	ret <4 x i32> %tmp3
 }
 
@@ -147,20 +147,20 @@
 ;CHECK-NOT: ld1
 ;CHECK: fcvtps.2d v0, v0
 ;CHECK-NEXT: ret
-	%tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtps.v2i64.v2f64(<2 x double> %A)
+	%tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double> %A)
 	ret <2 x i64> %tmp3
 }
 
-declare <2 x i32> @llvm.arm64.neon.fcvtps.v2i32.v2f32(<2 x float>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.fcvtps.v4i32.v4f32(<4 x float>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.fcvtps.v2i64.v2f64(<2 x double>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double>) nounwind readnone
 
 define <2 x i32> @fcvtpu_2s(<2 x float> %A) nounwind {
 ;CHECK-LABEL: fcvtpu_2s:
 ;CHECK-NOT: ld1
 ;CHECK: fcvtpu.2s v0, v0
 ;CHECK-NEXT: ret
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtpu.v2i32.v2f32(<2 x float> %A)
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float> %A)
 	ret <2 x i32> %tmp3
 }
 
@@ -169,7 +169,7 @@
 ;CHECK-NOT: ld1
 ;CHECK: fcvtpu.4s v0, v0
 ;CHECK-NEXT: ret
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtpu.v4i32.v4f32(<4 x float> %A)
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float> %A)
 	ret <4 x i32> %tmp3
 }
 
@@ -178,20 +178,20 @@
 ;CHECK-NOT: ld1
 ;CHECK: fcvtpu.2d v0, v0
 ;CHECK-NEXT: ret
-	%tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtpu.v2i64.v2f64(<2 x double> %A)
+	%tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double> %A)
 	ret <2 x i64> %tmp3
 }
 
-declare <2 x i32> @llvm.arm64.neon.fcvtpu.v2i32.v2f32(<2 x float>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.fcvtpu.v4i32.v4f32(<4 x float>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.fcvtpu.v2i64.v2f64(<2 x double>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double>) nounwind readnone
 
 define <2 x i32> @fcvtns_2s(<2 x float> %A) nounwind {
 ;CHECK-LABEL: fcvtns_2s:
 ;CHECK-NOT: ld1
 ;CHECK: fcvtns.2s v0, v0
 ;CHECK-NEXT: ret
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtns.v2i32.v2f32(<2 x float> %A)
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float> %A)
 	ret <2 x i32> %tmp3
 }
 
@@ -200,7 +200,7 @@
 ;CHECK-NOT: ld1
 ;CHECK: fcvtns.4s v0, v0
 ;CHECK-NEXT: ret
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtns.v4i32.v4f32(<4 x float> %A)
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float> %A)
 	ret <4 x i32> %tmp3
 }
 
@@ -209,20 +209,20 @@
 ;CHECK-NOT: ld1
 ;CHECK: fcvtns.2d v0, v0
 ;CHECK-NEXT: ret
-	%tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtns.v2i64.v2f64(<2 x double> %A)
+	%tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double> %A)
 	ret <2 x i64> %tmp3
 }
 
-declare <2 x i32> @llvm.arm64.neon.fcvtns.v2i32.v2f32(<2 x float>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.fcvtns.v4i32.v4f32(<4 x float>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.fcvtns.v2i64.v2f64(<2 x double>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double>) nounwind readnone
 
 define <2 x i32> @fcvtnu_2s(<2 x float> %A) nounwind {
 ;CHECK-LABEL: fcvtnu_2s:
 ;CHECK-NOT: ld1
 ;CHECK: fcvtnu.2s v0, v0
 ;CHECK-NEXT: ret
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.fcvtnu.v2i32.v2f32(<2 x float> %A)
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float> %A)
 	ret <2 x i32> %tmp3
 }
 
@@ -231,7 +231,7 @@
 ;CHECK-NOT: ld1
 ;CHECK: fcvtnu.4s v0, v0
 ;CHECK-NEXT: ret
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.fcvtnu.v4i32.v4f32(<4 x float> %A)
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float> %A)
 	ret <4 x i32> %tmp3
 }
 
@@ -240,13 +240,13 @@
 ;CHECK-NOT: ld1
 ;CHECK: fcvtnu.2d v0, v0
 ;CHECK-NEXT: ret
-	%tmp3 = call <2 x i64> @llvm.arm64.neon.fcvtnu.v2i64.v2f64(<2 x double> %A)
+	%tmp3 = call <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double> %A)
 	ret <2 x i64> %tmp3
 }
 
-declare <2 x i32> @llvm.arm64.neon.fcvtnu.v2i32.v2f32(<2 x float>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.fcvtnu.v4i32.v4f32(<4 x float>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.fcvtnu.v2i64.v2f64(<2 x double>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double>) nounwind readnone
 
 define <2 x i32> @fcvtzs_2s(<2 x float> %A) nounwind {
 ;CHECK-LABEL: fcvtzs_2s:
@@ -401,7 +401,7 @@
 ;CHECK-NOT: ld1
 ;CHECK: frintn.2s v0, v0
 ;CHECK-NEXT: ret
-	%tmp3 = call <2 x float> @llvm.arm64.neon.frintn.v2f32(<2 x float> %A)
+	%tmp3 = call <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float> %A)
 	ret <2 x float> %tmp3
 }
 
@@ -410,7 +410,7 @@
 ;CHECK-NOT: ld1
 ;CHECK: frintn.4s v0, v0
 ;CHECK-NEXT: ret
-	%tmp3 = call <4 x float> @llvm.arm64.neon.frintn.v4f32(<4 x float> %A)
+	%tmp3 = call <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float> %A)
 	ret <4 x float> %tmp3
 }
 
@@ -419,13 +419,13 @@
 ;CHECK-NOT: ld1
 ;CHECK: frintn.2d v0, v0
 ;CHECK-NEXT: ret
-	%tmp3 = call <2 x double> @llvm.arm64.neon.frintn.v2f64(<2 x double> %A)
+	%tmp3 = call <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double> %A)
 	ret <2 x double> %tmp3
 }
 
-declare <2 x float> @llvm.arm64.neon.frintn.v2f32(<2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.frintn.v4f32(<4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.frintn.v2f64(<2 x double>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double>) nounwind readnone
 
 define <2 x float> @frintp_2s(<2 x float> %A) nounwind {
 ;CHECK-LABEL: frintp_2s:
@@ -525,7 +525,7 @@
 ;CHECK-NOT: ld1
 ;CHECK: fcvtxn v0.2s, v0.2d
 ;CHECK-NEXT: ret
-	%tmp3 = call <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double> %A)
+	%tmp3 = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %A)
 	ret <2 x float> %tmp3
 }
 
@@ -534,19 +534,19 @@
 ;CHECK-NOT: ld1
 ;CHECK: fcvtxn2 v0.4s, v1.2d
 ;CHECK-NEXT: ret
-	%tmp3 = call <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double> %A)
+	%tmp3 = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %A)
         %res = shufflevector <2 x float> %ret, <2 x float> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 	ret <4 x float> %res
 }
 
-declare <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double>) nounwind readnone
 
 define <2 x i32> @fcvtzsc_2s(<2 x float> %A) nounwind {
 ;CHECK-LABEL: fcvtzsc_2s:
 ;CHECK-NOT: ld1
 ;CHECK: fcvtzs.2s v0, v0, #1
 ;CHECK-NEXT: ret
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %A, i32 1)
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %A, i32 1)
 	ret <2 x i32> %tmp3
 }
 
@@ -555,7 +555,7 @@
 ;CHECK-NOT: ld1
 ;CHECK: fcvtzs.4s v0, v0, #1
 ;CHECK-NEXT: ret
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> %A, i32 1)
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> %A, i32 1)
 	ret <4 x i32> %tmp3
 }
 
@@ -564,20 +564,20 @@
 ;CHECK-NOT: ld1
 ;CHECK: fcvtzs.2d v0, v0, #1
 ;CHECK-NEXT: ret
-	%tmp3 = call <2 x i64> @llvm.arm64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double> %A, i32 1)
+	%tmp3 = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double> %A, i32 1)
 	ret <2 x i64> %tmp3
 }
 
-declare <2 x i32> @llvm.arm64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double>, i32) nounwind readnone
 
 define <2 x i32> @fcvtzuc_2s(<2 x float> %A) nounwind {
 ;CHECK-LABEL: fcvtzuc_2s:
 ;CHECK-NOT: ld1
 ;CHECK: fcvtzu.2s v0, v0, #1
 ;CHECK-NEXT: ret
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> %A, i32 1)
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> %A, i32 1)
 	ret <2 x i32> %tmp3
 }
 
@@ -586,7 +586,7 @@
 ;CHECK-NOT: ld1
 ;CHECK: fcvtzu.4s v0, v0, #1
 ;CHECK-NEXT: ret
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> %A, i32 1)
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> %A, i32 1)
 	ret <4 x i32> %tmp3
 }
 
@@ -595,20 +595,20 @@
 ;CHECK-NOT: ld1
 ;CHECK: fcvtzu.2d v0, v0, #1
 ;CHECK-NEXT: ret
-	%tmp3 = call <2 x i64> @llvm.arm64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double> %A, i32 1)
+	%tmp3 = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double> %A, i32 1)
 	ret <2 x i64> %tmp3
 }
 
-declare <2 x i32> @llvm.arm64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double>, i32) nounwind readnone
 
 define <2 x float> @scvtf_2sc(<2 x i32> %A) nounwind {
 ;CHECK-LABEL: scvtf_2sc:
 ;CHECK-NOT: ld1
 ;CHECK: scvtf.2s v0, v0, #1
 ;CHECK-NEXT: ret
-	%tmp3 = call <2 x float> @llvm.arm64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %A, i32 1)
+	%tmp3 = call <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %A, i32 1)
 	ret <2 x float> %tmp3
 }
 
@@ -617,7 +617,7 @@
 ;CHECK-NOT: ld1
 ;CHECK: scvtf.4s v0, v0, #1
 ;CHECK-NEXT: ret
-	%tmp3 = call <4 x float> @llvm.arm64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %A, i32 1)
+	%tmp3 = call <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %A, i32 1)
 	ret <4 x float> %tmp3
 }
 
@@ -626,20 +626,20 @@
 ;CHECK-NOT: ld1
 ;CHECK: scvtf.2d v0, v0, #1
 ;CHECK-NEXT: ret
-	%tmp3 = call <2 x double> @llvm.arm64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> %A, i32 1)
+	%tmp3 = call <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> %A, i32 1)
 	ret <2 x double> %tmp3
 }
 
-declare <2 x float> @llvm.arm64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone
 
 define <2 x float> @ucvtf_2sc(<2 x i32> %A) nounwind {
 ;CHECK-LABEL: ucvtf_2sc:
 ;CHECK-NOT: ld1
 ;CHECK: ucvtf.2s v0, v0, #1
 ;CHECK-NEXT: ret
-	%tmp3 = call <2 x float> @llvm.arm64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %A, i32 1)
+	%tmp3 = call <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %A, i32 1)
 	ret <2 x float> %tmp3
 }
 
@@ -648,7 +648,7 @@
 ;CHECK-NOT: ld1
 ;CHECK: ucvtf.4s v0, v0, #1
 ;CHECK-NEXT: ret
-	%tmp3 = call <4 x float> @llvm.arm64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %A, i32 1)
+	%tmp3 = call <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %A, i32 1)
 	ret <4 x float> %tmp3
 }
 
@@ -657,7 +657,7 @@
 ;CHECK-NOT: ld1
 ;CHECK: ucvtf.2d v0, v0, #1
 ;CHECK-NEXT: ret
-	%tmp3 = call <2 x double> @llvm.arm64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> %A, i32 1)
+	%tmp3 = call <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> %A, i32 1)
 	ret <2 x double> %tmp3
 }
 
@@ -681,6 +681,6 @@
   ret void
 }
 
-declare <2 x float> @llvm.arm64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM64/vcvt_f.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
similarity index 72%
rename from llvm/test/CodeGen/ARM64/vcvt_f.ll
rename to llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
index d67aa3b..d244958 100644
--- a/llvm/test/CodeGen/ARM64/vcvt_f.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
-; RUN: llc < %s -O0 -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -O0 -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 define <2 x double> @test_vcvt_f64_f32(<2 x float> %x) nounwind readnone ssp {
 ; CHECK-LABEL: test_vcvt_f64_f32:
@@ -38,7 +38,7 @@
 
 define <2 x float> @test_vcvtx_f32_f64(<2 x double> %v) nounwind readnone ssp {
 ; CHECK-LABEL: test_vcvtx_f32_f64:
-  %vcvtx1.i = tail call <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double> %v) nounwind
+  %vcvtx1.i = tail call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %v) nounwind
 ; CHECK: fcvtxn
   ret <2 x float> %vcvtx1.i
 ; CHECK: ret
@@ -46,7 +46,7 @@
 
 define <4 x float> @test_vcvtx_high_f32_f64(<2 x float> %x, <2 x double> %v) nounwind readnone ssp {
 ; CHECK-LABEL: test_vcvtx_high_f32_f64:
-  %vcvtx2.i = tail call <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double> %v) nounwind
+  %vcvtx2.i = tail call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %v) nounwind
   %res = shufflevector <2 x float> %x, <2 x float> %vcvtx2.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK: fcvtxn2
   ret <4 x float> %res
@@ -54,13 +54,13 @@
 }
 
 
-declare <2 x double> @llvm.arm64.neon.vcvthighfp2df(<4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.vcvtfp2df(<2 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.vcvthighfp2df(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.vcvtfp2df(<2 x float>) nounwind readnone
 
-declare <2 x float> @llvm.arm64.neon.vcvtdf2fp(<2 x double>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.vcvthighdf2fp(<2 x float>, <2 x double>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.vcvtdf2fp(<2 x double>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.vcvthighdf2fp(<2 x float>, <2 x double>) nounwind readnone
 
-declare <2 x float> @llvm.arm64.neon.fcvtxn.v2f32.v2f64(<2 x double>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double>) nounwind readnone
 
 define i16 @to_half(float %in) {
 ; CHECK-LABEL: to_half:
diff --git a/llvm/test/CodeGen/ARM64/vcvt_f32_su32.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt_f32_su32.ll
similarity index 74%
rename from llvm/test/CodeGen/ARM64/vcvt_f32_su32.ll
rename to llvm/test/CodeGen/AArch64/arm64-vcvt_f32_su32.ll
index 51e053d..1eb7b43 100644
--- a/llvm/test/CodeGen/ARM64/vcvt_f32_su32.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vcvt_f32_su32.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 define <2 x float> @ucvt(<2 x i32> %a) nounwind readnone ssp {
 ; CHECK-LABEL: ucvt:
@@ -37,7 +37,7 @@
 ; CHECK-LABEL: cvtf16:
 ; CHECK: fcvtl  v0.4s, v0.4h
 ; CHECK-NEXT: ret
-  %vcvt1.i = tail call <4 x float> @llvm.arm64.neon.vcvthf2fp(<4 x i16> %a) nounwind
+  %vcvt1.i = tail call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> %a) nounwind
   ret <4 x float> %vcvt1.i
 }
 
@@ -46,7 +46,7 @@
 ; CHECK: fcvtl2  v0.4s, v0.8h
 ; CHECK-NEXT: ret
   %in = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %vcvt1.i = tail call <4 x float> @llvm.arm64.neon.vcvthf2fp(<4 x i16> %in) nounwind
+  %vcvt1.i = tail call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> %in) nounwind
   ret <4 x float> %vcvt1.i
 }
 
@@ -56,7 +56,7 @@
 ; CHECK-LABEL: cvtf16f32:
 ; CHECK: fcvtn  v0.4h, v0.4s
 ; CHECK-NEXT: ret
-  %vcvt1.i = tail call <4 x i16> @llvm.arm64.neon.vcvtfp2hf(<4 x float> %a) nounwind
+  %vcvt1.i = tail call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> %a) nounwind
   ret <4 x i16> %vcvt1.i
 }
 
@@ -64,10 +64,10 @@
 ; CHECK-LABEL: cvtf16f32_high:
 ; CHECK: fcvtn2 v0.8h, v1.4s
 ; CHECK-NEXT: ret
-  %high = call <4 x i16> @llvm.arm64.neon.vcvtfp2hf(<4 x float> %high_big)
+  %high = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> %high_big)
   %res = shufflevector <4 x i16> %low, <4 x i16> %high, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i16> %res
 }
 
-declare <4 x float> @llvm.arm64.neon.vcvthf2fp(<4 x i16>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.vcvtfp2hf(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float>) nounwind readnone
diff --git a/llvm/test/CodeGen/AArch64/arm64-vcvt_n.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt_n.ll
new file mode 100644
index 0000000..7ed5be6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-vcvt_n.ll
@@ -0,0 +1,49 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <2 x float> @cvtf32fxpu(<2 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: cvtf32fxpu:
+; CHECK: ucvtf.2s	v0, v0, #9
+; CHECK: ret
+  %vcvt_n1 = tail call <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %a, i32 9)
+  ret <2 x float> %vcvt_n1
+}
+
+define <2 x float> @cvtf32fxps(<2 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: cvtf32fxps:
+; CHECK: scvtf.2s	v0, v0, #12
+; CHECK: ret
+  %vcvt_n1 = tail call <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %a, i32 12)
+  ret <2 x float> %vcvt_n1
+}
+
+define <4 x float> @cvtqf32fxpu(<4 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: cvtqf32fxpu:
+; CHECK: ucvtf.4s	v0, v0, #18
+; CHECK: ret
+  %vcvt_n1 = tail call <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %a, i32 18)
+  ret <4 x float> %vcvt_n1
+}
+
+define <4 x float> @cvtqf32fxps(<4 x i32> %a) nounwind readnone ssp {
+; CHECK-LABEL: cvtqf32fxps:
+; CHECK: scvtf.4s	v0, v0, #30
+; CHECK: ret
+  %vcvt_n1 = tail call <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %a, i32 30)
+  ret <4 x float> %vcvt_n1
+}
+define <2 x double> @f1(<2 x i64> %a) nounwind readnone ssp {
+  %vcvt_n1 = tail call <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> %a, i32 12)
+  ret <2 x double> %vcvt_n1
+}
+
+define <2 x double> @f2(<2 x i64> %a) nounwind readnone ssp {
+  %vcvt_n1 = tail call <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> %a, i32 9)
+  ret <2 x double> %vcvt_n1
+}
+
+declare <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM64/vcvt_su32_f32.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt_su32_f32.ll
similarity index 90%
rename from llvm/test/CodeGen/ARM64/vcvt_su32_f32.ll
rename to llvm/test/CodeGen/AArch64/arm64-vcvt_su32_f32.ll
index 8c82fa0..985a5f7 100644
--- a/llvm/test/CodeGen/ARM64/vcvt_su32_f32.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vcvt_su32_f32.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 define <2 x i32> @c1(<2 x float> %a) nounwind readnone ssp {
 ; CHECK: c1
diff --git a/llvm/test/CodeGen/AArch64/arm64-vcvtxd_f32_f64.ll b/llvm/test/CodeGen/AArch64/arm64-vcvtxd_f32_f64.ll
new file mode 100644
index 0000000..b29c22c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-vcvtxd_f32_f64.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=arm64 | FileCheck %s
+
+define float @fcvtxn(double %a) {
+; CHECK-LABEL: fcvtxn:
+; CHECK: fcvtxn s0, d0
+; CHECK-NEXT: ret
+  %vcvtxd.i = tail call float @llvm.aarch64.sisd.fcvtxn(double %a) nounwind
+  ret float %vcvtxd.i
+}
+
+declare float @llvm.aarch64.sisd.fcvtxn(double) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM64/vecCmpBr.ll b/llvm/test/CodeGen/AArch64/arm64-vecCmpBr.ll
similarity index 87%
rename from llvm/test/CodeGen/ARM64/vecCmpBr.ll
rename to llvm/test/CodeGen/AArch64/arm64-vecCmpBr.ll
index 2af8775..c7321e4 100644
--- a/llvm/test/CodeGen/ARM64/vecCmpBr.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vecCmpBr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s -mcpu=cyclone | FileCheck %s
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s -mcpu=cyclone | FileCheck %s
 ; ModuleID = 'arm64_vecCmpBr.c'
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
 target triple = "arm64-apple-ios3.0.0"
@@ -13,7 +13,7 @@
 ; CHECK-NEXT: b _bar
 entry:
   %0 = bitcast <4 x i16> %a to <8 x i8>
-  %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v8i8(<8 x i8> %0) #3
+  %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %0) #3
   %1 = trunc i32 %vminv.i to i8
   %tobool = icmp eq i8 %1, 0
   br i1 %tobool, label %if.then, label %return
@@ -39,7 +39,7 @@
 
 entry:
   %0 = bitcast <8 x i16> %a to <16 x i8>
-  %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v16i8(<16 x i8> %0) #3
+  %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %0) #3
   %1 = trunc i32 %vminv.i to i8
   %tobool = icmp eq i8 %1, 0
   br i1 %tobool, label %if.then, label %return
@@ -63,7 +63,7 @@
 
 entry:
   %0 = bitcast <4 x i16> %a to <8 x i8>
-  %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v8i8(<8 x i8> %0) #3
+  %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> %0) #3
   %1 = trunc i32 %vmaxv.i to i8
   %tobool = icmp eq i8 %1, 0
   br i1 %tobool, label %return, label %if.then
@@ -86,7 +86,7 @@
 ; CHECK-NEXT: movz w0, #0
 entry:
   %0 = bitcast <8 x i16> %a to <16 x i8>
-  %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v16i8(<16 x i8> %0) #3
+  %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> %0) #3
   %1 = trunc i32 %vmaxv.i to i8
   %tobool = icmp eq i8 %1, 0
   br i1 %tobool, label %return, label %if.then
@@ -109,7 +109,7 @@
 ; CHECK-NEXT: b _bar
 entry:
   %0 = bitcast <4 x i16> %a to <8 x i8>
-  %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v8i8(<8 x i8> %0) #3
+  %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> %0) #3
   %1 = trunc i32 %vmaxv.i to i8
   %tobool = icmp eq i8 %1, 0
   br i1 %tobool, label %if.then, label %return
@@ -132,7 +132,7 @@
 ; CHECK-NEXT: b _bar
 entry:
   %0 = bitcast <8 x i16> %a to <16 x i8>
-  %vmaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v16i8(<16 x i8> %0) #3
+  %vmaxv.i = tail call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> %0) #3
   %1 = trunc i32 %vmaxv.i to i8
   %tobool = icmp eq i8 %1, 0
   br i1 %tobool, label %if.then, label %return
@@ -155,7 +155,7 @@
 ; CHECK-NEXT: movz w0, #0
 entry:
   %0 = bitcast <4 x i16> %a to <8 x i8>
-  %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v8i8(<8 x i8> %0) #3
+  %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %0) #3
   %1 = trunc i32 %vminv.i to i8
   %tobool = icmp eq i8 %1, 0
   br i1 %tobool, label %return, label %if.then
@@ -178,7 +178,7 @@
 ; CHECK-NEXT: movz w0, #0
 entry:
   %0 = bitcast <8 x i16> %a to <16 x i8>
-  %vminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v16i8(<16 x i8> %0) #3
+  %vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %0) #3
   %1 = trunc i32 %vminv.i to i8
   %tobool = icmp eq i8 %1, 0
   br i1 %tobool, label %return, label %if.then
@@ -192,13 +192,13 @@
   ret i32 %retval.0
 }
 
-declare i32 @llvm.arm64.neon.umaxv.i32.v16i8(<16 x i8>) #2
+declare i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8>) #2
 
-declare i32 @llvm.arm64.neon.umaxv.i32.v8i8(<8 x i8>) #2
+declare i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8>) #2
 
-declare i32 @llvm.arm64.neon.uminv.i32.v16i8(<16 x i8>) #2
+declare i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8>) #2
 
-declare i32 @llvm.arm64.neon.uminv.i32.v8i8(<8 x i8>) #2
+declare i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8>) #2
 
 attributes #0 = { nounwind ssp "target-cpu"="cyclone" }
 attributes #1 = { "target-cpu"="cyclone" }
diff --git a/llvm/test/CodeGen/ARM64/vecFold.ll b/llvm/test/CodeGen/AArch64/arm64-vecFold.ll
similarity index 74%
rename from llvm/test/CodeGen/ARM64/vecFold.ll
rename to llvm/test/CodeGen/AArch64/arm64-vecFold.ll
index 6888932f..aeacfcc 100644
--- a/llvm/test/CodeGen/ARM64/vecFold.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vecFold.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm64 -arm64-neon-syntax=apple -o - %s| FileCheck %s
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple -o - %s| FileCheck %s
 
 define <16 x i8> @foov16i8(<8 x i16> %a0, <8 x i16> %b0) nounwind readnone ssp {
 ; CHECK-LABEL: foov16i8:
@@ -50,8 +50,8 @@
 
 define <8 x i16> @bar(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %b0, <4 x i32> %b1) nounwind readnone ssp {
 ; CHECK-LABEL: bar:
-  %vaddhn2.i = tail call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwind
-  %vaddhn2.i10 = tail call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %b0, <4 x i32> %b1) nounwind
+  %vaddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwind
+  %vaddhn2.i10 = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %b0, <4 x i32> %b1) nounwind
 ; CHECK: addhn.4h	v0, v0, v1
 ; CHECK-NEXT: addhn2.8h	v0, v2, v3
 ; CHECK-NEXT: ret
@@ -64,7 +64,7 @@
 
 define <8 x i16> @baz(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %b0, <4 x i32> %b1) nounwind readnone ssp {
 ; CHECK-LABEL: baz:
-  %vaddhn2.i = tail call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwind
+  %vaddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwind
   %vshrn_high_shift = ashr <4 x i32> %b0, <i32 5, i32 5, i32 5, i32 5>
   %vshrn_high = trunc <4 x i32> %vshrn_high_shift to <4 x i16>
 ; CHECK: addhn.4h	v0, v0, v1
@@ -83,8 +83,8 @@
 ; CHECK: 	raddhn.4h	v0, v0, v1
 ; CHECK-NEXT: 	raddhn2.8h	v0, v2, v3
 ; CHECK-NEXT: 	ret
-  %vraddhn2.i = tail call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwind
-  %vraddhn2.i10 = tail call <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32> %b0, <4 x i32> %b1) nounwind
+  %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a0, <4 x i32> %a1) nounwind
+  %vraddhn2.i10 = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %b0, <4 x i32> %b1) nounwind
   %0 = bitcast <4 x i16> %vraddhn2.i to <1 x i64>
   %1 = bitcast <4 x i16> %vraddhn2.i10 to <1 x i64>
   %shuffle.i = shufflevector <1 x i64> %0, <1 x i64> %1, <2 x i32> <i32 0, i32 1>
@@ -97,8 +97,8 @@
 ; CHECK: rshrn.8b	v0, v0, #5
 ; CHECK-NEXT: rshrn2.16b	v0, v2, #6
 ; CHECK-NEXT: ret
-  %vrshrn_n1 = tail call <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16> %a0, i32 5)
-  %vrshrn_n4 = tail call <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16> %b0, i32 6)
+  %vrshrn_n1 = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %a0, i32 5)
+  %vrshrn_n4 = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %b0, i32 6)
   %1 = bitcast <8 x i8> %vrshrn_n1 to <1 x i64>
   %2 = bitcast <8 x i8> %vrshrn_n4 to <1 x i64>
   %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
@@ -111,8 +111,8 @@
 ; CHECK: rsubhn.8b	v0, v0, v1
 ; CHECK: rsubhn2.16b	v0, v2, v3
 ; CHECK-NEXT: 	ret
-  %vrsubhn2.i = tail call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %a0, <8 x i16> %a1) nounwind
-  %vrsubhn2.i10 = tail call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %b0, <8 x i16> %b1) nounwind
+  %vrsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a0, <8 x i16> %a1) nounwind
+  %vrsubhn2.i10 = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %b0, <8 x i16> %b1) nounwind
   %1 = bitcast <8 x i8> %vrsubhn2.i to <1 x i64>
   %2 = bitcast <8 x i8> %vrsubhn2.i10 to <1 x i64>
   %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> <i32 0, i32 1>
@@ -122,8 +122,8 @@
 
 define <8 x i16> @noOpt1(<2 x i32> %a0, <2 x i32> %a1, <4 x i32> %b0, <4 x i32> %b1) nounwind readnone ssp {
 ; CHECK-LABEL: noOpt1:
-  %vqsub2.i = tail call <2 x i32> @llvm.arm64.neon.sqsub.v2i32(<2 x i32> %a0, <2 x i32> %a1) nounwind
-  %vaddhn2.i = tail call <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32> %b0, <4 x i32> %b1) nounwind
+  %vqsub2.i = tail call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %a0, <2 x i32> %a1) nounwind
+  %vaddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %b0, <4 x i32> %b1) nounwind
 ; CHECK:	sqsub.2s	v0, v0, v1
 ; CHECK-NEXT:	addhn2.8h	v0, v2, v3
   %1 = bitcast <2 x i32> %vqsub2.i to <1 x i64>
@@ -133,13 +133,13 @@
   ret <8 x i16> %3
 }
 
-declare <2 x i32> @llvm.arm64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
 
-declare <8 x i8> @llvm.arm64.neon.shrn.v8i8(<8 x i16>, i32) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.shrn.v4i16(<4 x i32>, i32) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.shrn.v2i32(<2 x i64>, i32) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.addhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16>, i32) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.shrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.shrn.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.shrn.v2i32(<2 x i64>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
 
diff --git a/llvm/test/CodeGen/ARM64/vector-ext.ll b/llvm/test/CodeGen/AArch64/arm64-vector-ext.ll
similarity index 80%
rename from llvm/test/CodeGen/ARM64/vector-ext.ll
rename to llvm/test/CodeGen/AArch64/arm64-vector-ext.ll
index 9cc0555..650ff1e 100644
--- a/llvm/test/CodeGen/ARM64/vector-ext.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vector-ext.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 ;CHECK: @func30
 ;CHECK: ushll.4s  v0, v0, #0
diff --git a/llvm/test/CodeGen/ARM64/vector-imm.ll b/llvm/test/CodeGen/AArch64/arm64-vector-imm.ll
similarity index 97%
rename from llvm/test/CodeGen/ARM64/vector-imm.ll
rename to llvm/test/CodeGen/AArch64/arm64-vector-imm.ll
index a84f804..9fb088b 100644
--- a/llvm/test/CodeGen/ARM64/vector-imm.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vector-imm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 define <8 x i8> @v_orrimm(<8 x i8>* %A) nounwind {
 ; CHECK-LABEL: v_orrimm:
diff --git a/llvm/test/CodeGen/ARM64/vector-insertion.ll b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll
similarity index 91%
rename from llvm/test/CodeGen/ARM64/vector-insertion.ll
rename to llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll
index 0926bcf..8fbff71 100644
--- a/llvm/test/CodeGen/ARM64/vector-insertion.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm64 -mcpu=generic -arm64-neon-syntax=apple < %s | FileCheck %s
+; RUN: llc -march=arm64 -mcpu=generic -aarch64-neon-syntax=apple < %s | FileCheck %s
 
 define void @test0f(float* nocapture %x, float %a) #0 {
 entry:
diff --git a/llvm/test/CodeGen/ARM64/vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll
similarity index 99%
rename from llvm/test/CodeGen/ARM64/vector-ldst.ll
rename to llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll
index 154160e..c001915 100644
--- a/llvm/test/CodeGen/ARM64/vector-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vector-ldst.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
 
 ; rdar://9428579
 
diff --git a/llvm/test/CodeGen/ARM64/vext.ll b/llvm/test/CodeGen/AArch64/arm64-vext.ll
similarity index 99%
rename from llvm/test/CodeGen/ARM64/vext.ll
rename to llvm/test/CodeGen/AArch64/arm64-vext.ll
index c820439..2240dfd 100644
--- a/llvm/test/CodeGen/ARM64/vext.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vext.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s
 
 define void @test_vext_s8() nounwind ssp {
   ; CHECK-LABEL: test_vext_s8:
diff --git a/llvm/test/CodeGen/ARM64/vext_reverse.ll b/llvm/test/CodeGen/AArch64/arm64-vext_reverse.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/vext_reverse.ll
rename to llvm/test/CodeGen/AArch64/arm64-vext_reverse.ll
diff --git a/llvm/test/CodeGen/ARM64/vfloatintrinsics.ll b/llvm/test/CodeGen/AArch64/arm64-vfloatintrinsics.ll
similarity index 98%
rename from llvm/test/CodeGen/ARM64/vfloatintrinsics.ll
rename to llvm/test/CodeGen/AArch64/arm64-vfloatintrinsics.ll
index a8c882b..255a182 100644
--- a/llvm/test/CodeGen/ARM64/vfloatintrinsics.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vfloatintrinsics.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s | FileCheck %s
 
 ;;; Float vectors
 
diff --git a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
new file mode 100644
index 0000000..6178bf9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll
@@ -0,0 +1,249 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @shadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: shadd8b:
+;CHECK: shadd.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = load <8 x i8>* %B
+	%tmp3 = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+	ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @shadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: shadd16b:
+;CHECK: shadd.16b
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = load <16 x i8>* %B
+	%tmp3 = call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+	ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @shadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: shadd4h:
+;CHECK: shadd.4h
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = load <4 x i16>* %B
+	%tmp3 = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+	ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @shadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: shadd8h:
+;CHECK: shadd.8h
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = load <8 x i16>* %B
+	%tmp3 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+	ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @shadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: shadd2s:
+;CHECK: shadd.2s
+	%tmp1 = load <2 x i32>* %A
+	%tmp2 = load <2 x i32>* %B
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @shadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: shadd4s:
+;CHECK: shadd.4s
+	%tmp1 = load <4 x i32>* %A
+	%tmp2 = load <4 x i32>* %B
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+	ret <4 x i32> %tmp3
+}
+
+define <8 x i8> @uhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uhadd8b:
+;CHECK: uhadd.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = load <8 x i8>* %B
+	%tmp3 = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+	ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @uhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: uhadd16b:
+;CHECK: uhadd.16b
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = load <16 x i8>* %B
+	%tmp3 = call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+	ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @uhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uhadd4h:
+;CHECK: uhadd.4h
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = load <4 x i16>* %B
+	%tmp3 = call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+	ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @uhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uhadd8h:
+;CHECK: uhadd.8h
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = load <8 x i16>* %B
+	%tmp3 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+	ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @uhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uhadd2s:
+;CHECK: uhadd.2s
+	%tmp1 = load <2 x i32>* %A
+	%tmp2 = load <2 x i32>* %B
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @uhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uhadd4s:
+;CHECK: uhadd.4s
+	%tmp1 = load <4 x i32>* %A
+	%tmp2 = load <4 x i32>* %B
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+	ret <4 x i32> %tmp3
+}
+
+declare <8 x i8>  @llvm.aarch64.neon.shadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i8>  @llvm.aarch64.neon.uhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i8> @srhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: srhadd8b:
+;CHECK: srhadd.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = load <8 x i8>* %B
+	%tmp3 = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+	ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @srhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: srhadd16b:
+;CHECK: srhadd.16b
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = load <16 x i8>* %B
+	%tmp3 = call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+	ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @srhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: srhadd4h:
+;CHECK: srhadd.4h
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = load <4 x i16>* %B
+	%tmp3 = call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+	ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @srhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: srhadd8h:
+;CHECK: srhadd.8h
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = load <8 x i16>* %B
+	%tmp3 = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+	ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @srhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: srhadd2s:
+;CHECK: srhadd.2s
+	%tmp1 = load <2 x i32>* %A
+	%tmp2 = load <2 x i32>* %B
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @srhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: srhadd4s:
+;CHECK: srhadd.4s
+	%tmp1 = load <4 x i32>* %A
+	%tmp2 = load <4 x i32>* %B
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+	ret <4 x i32> %tmp3
+}
+
+define <8 x i8> @urhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: urhadd8b:
+;CHECK: urhadd.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = load <8 x i8>* %B
+	%tmp3 = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+	ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @urhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: urhadd16b:
+;CHECK: urhadd.16b
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = load <16 x i8>* %B
+	%tmp3 = call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+	ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @urhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: urhadd4h:
+;CHECK: urhadd.4h
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = load <4 x i16>* %B
+	%tmp3 = call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+	ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @urhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: urhadd8h:
+;CHECK: urhadd.8h
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = load <8 x i16>* %B
+	%tmp3 = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+	ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @urhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: urhadd2s:
+;CHECK: urhadd.2s
+	%tmp1 = load <2 x i32>* %A
+	%tmp2 = load <2 x i32>* %B
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @urhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: urhadd4s:
+;CHECK: urhadd.4s
+	%tmp1 = load <4 x i32>* %A
+	%tmp2 = load <4 x i32>* %B
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+	ret <4 x i32> %tmp3
+}
+
+declare <8 x i8>  @llvm.aarch64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i8>  @llvm.aarch64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
diff --git a/llvm/test/CodeGen/AArch64/arm64-vhsub.ll b/llvm/test/CodeGen/AArch64/arm64-vhsub.ll
new file mode 100644
index 0000000..13bfda3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-vhsub.ll
@@ -0,0 +1,125 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @shsub8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: shsub8b:
+;CHECK: shsub.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = load <8 x i8>* %B
+	%tmp3 = call <8 x i8> @llvm.aarch64.neon.shsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+	ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @shsub16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: shsub16b:
+;CHECK: shsub.16b
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = load <16 x i8>* %B
+	%tmp3 = call <16 x i8> @llvm.aarch64.neon.shsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+	ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @shsub4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: shsub4h:
+;CHECK: shsub.4h
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = load <4 x i16>* %B
+	%tmp3 = call <4 x i16> @llvm.aarch64.neon.shsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+	ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @shsub8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: shsub8h:
+;CHECK: shsub.8h
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = load <8 x i16>* %B
+	%tmp3 = call <8 x i16> @llvm.aarch64.neon.shsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+	ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @shsub2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: shsub2s:
+;CHECK: shsub.2s
+	%tmp1 = load <2 x i32>* %A
+	%tmp2 = load <2 x i32>* %B
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.shsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @shsub4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: shsub4s:
+;CHECK: shsub.4s
+	%tmp1 = load <4 x i32>* %A
+	%tmp2 = load <4 x i32>* %B
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.shsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+	ret <4 x i32> %tmp3
+}
+
+define <8 x i8> @uhsub8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uhsub8b:
+;CHECK: uhsub.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = load <8 x i8>* %B
+	%tmp3 = call <8 x i8> @llvm.aarch64.neon.uhsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+	ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @uhsub16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: uhsub16b:
+;CHECK: uhsub.16b
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = load <16 x i8>* %B
+	%tmp3 = call <16 x i8> @llvm.aarch64.neon.uhsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+	ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @uhsub4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uhsub4h:
+;CHECK: uhsub.4h
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = load <4 x i16>* %B
+	%tmp3 = call <4 x i16> @llvm.aarch64.neon.uhsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+	ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @uhsub8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uhsub8h:
+;CHECK: uhsub.8h
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = load <8 x i16>* %B
+	%tmp3 = call <8 x i16> @llvm.aarch64.neon.uhsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+	ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @uhsub2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uhsub2s:
+;CHECK: uhsub.2s
+	%tmp1 = load <2 x i32>* %A
+	%tmp2 = load <2 x i32>* %B
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.uhsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @uhsub4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uhsub4s:
+;CHECK: uhsub.4s
+	%tmp1 = load <4 x i32>* %A
+	%tmp2 = load <4 x i32>* %B
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.uhsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+	ret <4 x i32> %tmp3
+}
+
+declare <8 x i8>  @llvm.aarch64.neon.shsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.shsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.shsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i8>  @llvm.aarch64.neon.uhsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uhsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uhsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.shsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.shsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.shsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.uhsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.uhsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.uhsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM64/virtual_base.ll b/llvm/test/CodeGen/AArch64/arm64-virtual_base.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/virtual_base.ll
rename to llvm/test/CodeGen/AArch64/arm64-virtual_base.ll
diff --git a/llvm/test/CodeGen/AArch64/arm64-vmax.ll b/llvm/test/CodeGen/AArch64/arm64-vmax.ll
new file mode 100644
index 0000000..3f2c134
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-vmax.ll
@@ -0,0 +1,679 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @smax_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: smax_8b:
+;CHECK: smax.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = load <8 x i8>* %B
+	%tmp3 = call <8 x i8> @llvm.aarch64.neon.smax.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+	ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @smax_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: smax_16b:
+;CHECK: smax.16b
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = load <16 x i8>* %B
+	%tmp3 = call <16 x i8> @llvm.aarch64.neon.smax.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+	ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @smax_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: smax_4h:
+;CHECK: smax.4h
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = load <4 x i16>* %B
+	%tmp3 = call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+	ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @smax_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: smax_8h:
+;CHECK: smax.8h
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = load <8 x i16>* %B
+	%tmp3 = call <8 x i16> @llvm.aarch64.neon.smax.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+	ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @smax_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: smax_2s:
+;CHECK: smax.2s
+	%tmp1 = load <2 x i32>* %A
+	%tmp2 = load <2 x i32>* %B
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @smax_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: smax_4s:
+;CHECK: smax.4s
+	%tmp1 = load <4 x i32>* %A
+	%tmp2 = load <4 x i32>* %B
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+	ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.smax.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.smax.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.smax.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i8> @umax_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: umax_8b:
+;CHECK: umax.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = load <8 x i8>* %B
+	%tmp3 = call <8 x i8> @llvm.aarch64.neon.umax.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+	ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @umax_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: umax_16b:
+;CHECK: umax.16b
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = load <16 x i8>* %B
+	%tmp3 = call <16 x i8> @llvm.aarch64.neon.umax.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+	ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @umax_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: umax_4h:
+;CHECK: umax.4h
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = load <4 x i16>* %B
+	%tmp3 = call <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+	ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @umax_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: umax_8h:
+;CHECK: umax.8h
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = load <8 x i16>* %B
+	%tmp3 = call <8 x i16> @llvm.aarch64.neon.umax.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+	ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @umax_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: umax_2s:
+;CHECK: umax.2s
+	%tmp1 = load <2 x i32>* %A
+	%tmp2 = load <2 x i32>* %B
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @umax_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: umax_4s:
+;CHECK: umax.4s
+	%tmp1 = load <4 x i32>* %A
+	%tmp2 = load <4 x i32>* %B
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+	ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.umax.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.umax.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.umax.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i8> @smin_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: smin_8b:
+;CHECK: smin.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = load <8 x i8>* %B
+	%tmp3 = call <8 x i8> @llvm.aarch64.neon.smin.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+	ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @smin_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: smin_16b:
+;CHECK: smin.16b
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = load <16 x i8>* %B
+	%tmp3 = call <16 x i8> @llvm.aarch64.neon.smin.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+	ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @smin_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: smin_4h:
+;CHECK: smin.4h
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = load <4 x i16>* %B
+	%tmp3 = call <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+	ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @smin_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: smin_8h:
+;CHECK: smin.8h
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = load <8 x i16>* %B
+	%tmp3 = call <8 x i16> @llvm.aarch64.neon.smin.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+	ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @smin_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: smin_2s:
+;CHECK: smin.2s
+	%tmp1 = load <2 x i32>* %A
+	%tmp2 = load <2 x i32>* %B
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @smin_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: smin_4s:
+;CHECK: smin.4s
+	%tmp1 = load <4 x i32>* %A
+	%tmp2 = load <4 x i32>* %B
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+	ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.smin.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.smin.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.smin.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i8> @umin_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: umin_8b:
+;CHECK: umin.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = load <8 x i8>* %B
+	%tmp3 = call <8 x i8> @llvm.aarch64.neon.umin.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+	ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @umin_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: umin_16b:
+;CHECK: umin.16b
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = load <16 x i8>* %B
+	%tmp3 = call <16 x i8> @llvm.aarch64.neon.umin.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+	ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @umin_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: umin_4h:
+;CHECK: umin.4h
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = load <4 x i16>* %B
+	%tmp3 = call <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+	ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @umin_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: umin_8h:
+;CHECK: umin.8h
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = load <8 x i16>* %B
+	%tmp3 = call <8 x i16> @llvm.aarch64.neon.umin.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+	ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @umin_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: umin_2s:
+;CHECK: umin.2s
+	%tmp1 = load <2 x i32>* %A
+	%tmp2 = load <2 x i32>* %B
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @umin_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: umin_4s:
+;CHECK: umin.4s
+	%tmp1 = load <4 x i32>* %A
+	%tmp2 = load <4 x i32>* %B
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+	ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.umin.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.umin.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.umin.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
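+; smaxp/umaxp are the pairwise forms: each result element is the larger of a
+; pair of adjacent elements drawn from the concatenation of the two sources.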
+define <8 x i8> @smaxp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: smaxp_8b:
+;CHECK: smaxp.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = load <8 x i8>* %B
+	%tmp3 = call <8 x i8> @llvm.aarch64.neon.smaxp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+	ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @smaxp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: smaxp_16b:
+;CHECK: smaxp.16b
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = load <16 x i8>* %B
+	%tmp3 = call <16 x i8> @llvm.aarch64.neon.smaxp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+	ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @smaxp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: smaxp_4h:
+;CHECK: smaxp.4h
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = load <4 x i16>* %B
+	%tmp3 = call <4 x i16> @llvm.aarch64.neon.smaxp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+	ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @smaxp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: smaxp_8h:
+;CHECK: smaxp.8h
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = load <8 x i16>* %B
+	%tmp3 = call <8 x i16> @llvm.aarch64.neon.smaxp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+	ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @smaxp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: smaxp_2s:
+;CHECK: smaxp.2s
+	%tmp1 = load <2 x i32>* %A
+	%tmp2 = load <2 x i32>* %B
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.smaxp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @smaxp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: smaxp_4s:
+;CHECK: smaxp.4s
+	%tmp1 = load <4 x i32>* %A
+	%tmp2 = load <4 x i32>* %B
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.smaxp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+	ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.smaxp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.smaxp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.smaxp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.smaxp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.smaxp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.smaxp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i8> @umaxp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: umaxp_8b:
+;CHECK: umaxp.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = load <8 x i8>* %B
+	%tmp3 = call <8 x i8> @llvm.aarch64.neon.umaxp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+	ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @umaxp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: umaxp_16b:
+;CHECK: umaxp.16b
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = load <16 x i8>* %B
+	%tmp3 = call <16 x i8> @llvm.aarch64.neon.umaxp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+	ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @umaxp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: umaxp_4h:
+;CHECK: umaxp.4h
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = load <4 x i16>* %B
+	%tmp3 = call <4 x i16> @llvm.aarch64.neon.umaxp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+	ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @umaxp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: umaxp_8h:
+;CHECK: umaxp.8h
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = load <8 x i16>* %B
+	%tmp3 = call <8 x i16> @llvm.aarch64.neon.umaxp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+	ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @umaxp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: umaxp_2s:
+;CHECK: umaxp.2s
+	%tmp1 = load <2 x i32>* %A
+	%tmp2 = load <2 x i32>* %B
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.umaxp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @umaxp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: umaxp_4s:
+;CHECK: umaxp.4s
+	%tmp1 = load <4 x i32>* %A
+	%tmp2 = load <4 x i32>* %B
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.umaxp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+	ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.umaxp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.umaxp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.umaxp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.umaxp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.umaxp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.umaxp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
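+; sminp/uminp mirror the pairwise maxima above, selecting the smaller element
+; of each adjacent pair instead.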
+define <8 x i8> @sminp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: sminp_8b:
+;CHECK: sminp.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = load <8 x i8>* %B
+	%tmp3 = call <8 x i8> @llvm.aarch64.neon.sminp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+	ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @sminp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: sminp_16b:
+;CHECK: sminp.16b
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = load <16 x i8>* %B
+	%tmp3 = call <16 x i8> @llvm.aarch64.neon.sminp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+	ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @sminp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sminp_4h:
+;CHECK: sminp.4h
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = load <4 x i16>* %B
+	%tmp3 = call <4 x i16> @llvm.aarch64.neon.sminp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+	ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @sminp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sminp_8h:
+;CHECK: sminp.8h
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = load <8 x i16>* %B
+	%tmp3 = call <8 x i16> @llvm.aarch64.neon.sminp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+	ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @sminp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sminp_2s:
+;CHECK: sminp.2s
+	%tmp1 = load <2 x i32>* %A
+	%tmp2 = load <2 x i32>* %B
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.sminp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @sminp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sminp_4s:
+;CHECK: sminp.4s
+	%tmp1 = load <4 x i32>* %A
+	%tmp2 = load <4 x i32>* %B
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.sminp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+	ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.sminp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.sminp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sminp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sminp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sminp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sminp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i8> @uminp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uminp_8b:
+;CHECK: uminp.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = load <8 x i8>* %B
+	%tmp3 = call <8 x i8> @llvm.aarch64.neon.uminp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+	ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @uminp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: uminp_16b:
+;CHECK: uminp.16b
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = load <16 x i8>* %B
+	%tmp3 = call <16 x i8> @llvm.aarch64.neon.uminp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+	ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @uminp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uminp_4h:
+;CHECK: uminp.4h
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = load <4 x i16>* %B
+	%tmp3 = call <4 x i16> @llvm.aarch64.neon.uminp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+	ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @uminp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uminp_8h:
+;CHECK: uminp.8h
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = load <8 x i16>* %B
+	%tmp3 = call <8 x i16> @llvm.aarch64.neon.uminp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+	ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @uminp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uminp_2s:
+;CHECK: uminp.2s
+	%tmp1 = load <2 x i32>* %A
+	%tmp2 = load <2 x i32>* %B
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.uminp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @uminp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uminp_4s:
+;CHECK: uminp.4s
+	%tmp1 = load <4 x i32>* %A
+	%tmp2 = load <4 x i32>* %B
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.uminp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+	ret <4 x i32> %tmp3
+}
+
+declare <8 x i8> @llvm.aarch64.neon.uminp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.uminp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uminp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.uminp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uminp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.uminp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+
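+; fmax/fmin propagate NaNs: if either element of a compared pair is NaN, the
+; corresponding result element is NaN.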
+define <2 x float> @fmax_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: fmax_2s:
+;CHECK: fmax.2s
+	%tmp1 = load <2 x float>* %A
+	%tmp2 = load <2 x float>* %B
+	%tmp3 = call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+	ret <2 x float> %tmp3
+}
+
+define <4 x float> @fmax_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: fmax_4s:
+;CHECK: fmax.4s
+	%tmp1 = load <4 x float>* %A
+	%tmp2 = load <4 x float>* %B
+	%tmp3 = call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+	ret <4 x float> %tmp3
+}
+
+define <2 x double> @fmax_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: fmax_2d:
+;CHECK: fmax.2d
+	%tmp1 = load <2 x double>* %A
+	%tmp2 = load <2 x double>* %B
+	%tmp3 = call <2 x double> @llvm.aarch64.neon.fmax.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+	ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.fmax.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x float> @fmaxp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: fmaxp_2s:
+;CHECK: fmaxp.2s
+	%tmp1 = load <2 x float>* %A
+	%tmp2 = load <2 x float>* %B
+	%tmp3 = call <2 x float> @llvm.aarch64.neon.fmaxp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+	ret <2 x float> %tmp3
+}
+
+define <4 x float> @fmaxp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: fmaxp_4s:
+;CHECK: fmaxp.4s
+	%tmp1 = load <4 x float>* %A
+	%tmp2 = load <4 x float>* %B
+	%tmp3 = call <4 x float> @llvm.aarch64.neon.fmaxp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+	ret <4 x float> %tmp3
+}
+
+define <2 x double> @fmaxp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: fmaxp_2d:
+;CHECK: fmaxp.2d
+	%tmp1 = load <2 x double>* %A
+	%tmp2 = load <2 x double>* %B
+	%tmp3 = call <2 x double> @llvm.aarch64.neon.fmaxp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+	ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.aarch64.neon.fmaxp.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fmaxp.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.fmaxp.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x float> @fmin_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: fmin_2s:
+;CHECK: fmin.2s
+	%tmp1 = load <2 x float>* %A
+	%tmp2 = load <2 x float>* %B
+	%tmp3 = call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+	ret <2 x float> %tmp3
+}
+
+define <4 x float> @fmin_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: fmin_4s:
+;CHECK: fmin.4s
+	%tmp1 = load <4 x float>* %A
+	%tmp2 = load <4 x float>* %B
+	%tmp3 = call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+	ret <4 x float> %tmp3
+}
+
+define <2 x double> @fmin_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: fmin_2d:
+;CHECK: fmin.2d
+	%tmp1 = load <2 x double>* %A
+	%tmp2 = load <2 x double>* %B
+	%tmp3 = call <2 x double> @llvm.aarch64.neon.fmin.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+	ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.fmin.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x float> @fminp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: fminp_2s:
+;CHECK: fminp.2s
+	%tmp1 = load <2 x float>* %A
+	%tmp2 = load <2 x float>* %B
+	%tmp3 = call <2 x float> @llvm.aarch64.neon.fminp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+	ret <2 x float> %tmp3
+}
+
+define <4 x float> @fminp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: fminp_4s:
+;CHECK: fminp.4s
+	%tmp1 = load <4 x float>* %A
+	%tmp2 = load <4 x float>* %B
+	%tmp3 = call <4 x float> @llvm.aarch64.neon.fminp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+	ret <4 x float> %tmp3
+}
+
+define <2 x double> @fminp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: fminp_2d:
+;CHECK: fminp.2d
+	%tmp1 = load <2 x double>* %A
+	%tmp2 = load <2 x double>* %B
+	%tmp3 = call <2 x double> @llvm.aarch64.neon.fminp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+	ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.aarch64.neon.fminp.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fminp.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.fminp.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
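+; The "NM" pairwise variants follow IEEE 754-2008 maxNum/minNum: when exactly
+; one operand of a pair is a quiet NaN, the numeric operand is returned.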
+define <2 x float> @fminnmp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: fminnmp_2s:
+;CHECK: fminnmp.2s
+	%tmp1 = load <2 x float>* %A
+	%tmp2 = load <2 x float>* %B
+	%tmp3 = call <2 x float> @llvm.aarch64.neon.fminnmp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+	ret <2 x float> %tmp3
+}
+
+define <4 x float> @fminnmp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: fminnmp_4s:
+;CHECK: fminnmp.4s
+	%tmp1 = load <4 x float>* %A
+	%tmp2 = load <4 x float>* %B
+	%tmp3 = call <4 x float> @llvm.aarch64.neon.fminnmp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+	ret <4 x float> %tmp3
+}
+
+define <2 x double> @fminnmp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: fminnmp_2d:
+;CHECK: fminnmp.2d
+	%tmp1 = load <2 x double>* %A
+	%tmp2 = load <2 x double>* %B
+	%tmp3 = call <2 x double> @llvm.aarch64.neon.fminnmp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+	ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.aarch64.neon.fminnmp.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fminnmp.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.fminnmp.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x float> @fmaxnmp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: fmaxnmp_2s:
+;CHECK: fmaxnmp.2s
+	%tmp1 = load <2 x float>* %A
+	%tmp2 = load <2 x float>* %B
+	%tmp3 = call <2 x float> @llvm.aarch64.neon.fmaxnmp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+	ret <2 x float> %tmp3
+}
+
+define <4 x float> @fmaxnmp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: fmaxnmp_4s:
+;CHECK: fmaxnmp.4s
+	%tmp1 = load <4 x float>* %A
+	%tmp2 = load <4 x float>* %B
+	%tmp3 = call <4 x float> @llvm.aarch64.neon.fmaxnmp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+	ret <4 x float> %tmp3
+}
+
+define <2 x double> @fmaxnmp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: fmaxnmp_2d:
+;CHECK: fmaxnmp.2d
+	%tmp1 = load <2 x double>* %A
+	%tmp2 = load <2 x double>* %B
+	%tmp3 = call <2 x double> @llvm.aarch64.neon.fmaxnmp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+	ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.aarch64.neon.fmaxnmp.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fmaxnmp.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.fmaxnmp.v2f64(<2 x double>, <2 x double>) nounwind readnone
diff --git a/llvm/test/CodeGen/AArch64/arm64-vminmaxnm.ll b/llvm/test/CodeGen/AArch64/arm64-vminmaxnm.ll
new file mode 100644
index 0000000..b5aca45
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-vminmaxnm.ll
@@ -0,0 +1,68 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <2 x float> @f1(<2 x float> %a, <2 x float> %b) nounwind readnone ssp {
+; CHECK: fmaxnm.2s	v0, v0, v1
+; CHECK: ret
+  %vmaxnm2.i = tail call <2 x float> @llvm.aarch64.neon.fmaxnm.v2f32(<2 x float> %a, <2 x float> %b) nounwind
+  ret <2 x float> %vmaxnm2.i
+}
+
+define <4 x float> @f2(<4 x float> %a, <4 x float> %b) nounwind readnone ssp {
+; CHECK: fmaxnm.4s	v0, v0, v1
+; CHECK: ret
+  %vmaxnm2.i = tail call <4 x float> @llvm.aarch64.neon.fmaxnm.v4f32(<4 x float> %a, <4 x float> %b) nounwind
+  ret <4 x float> %vmaxnm2.i
+}
+
+define <2 x double> @f3(<2 x double> %a, <2 x double> %b) nounwind readnone ssp {
+; CHECK: fmaxnm.2d	v0, v0, v1
+; CHECK: ret
+  %vmaxnm2.i = tail call <2 x double> @llvm.aarch64.neon.fmaxnm.v2f64(<2 x double> %a, <2 x double> %b) nounwind
+  ret <2 x double> %vmaxnm2.i
+}
+
+define <2 x float> @f4(<2 x float> %a, <2 x float> %b) nounwind readnone ssp {
+; CHECK: fminnm.2s	v0, v0, v1
+; CHECK: ret
+  %vminnm2.i = tail call <2 x float> @llvm.aarch64.neon.fminnm.v2f32(<2 x float> %a, <2 x float> %b) nounwind
+  ret <2 x float> %vminnm2.i
+}
+
+define <4 x float> @f5(<4 x float> %a, <4 x float> %b) nounwind readnone ssp {
+; CHECK: fminnm.4s	v0, v0, v1
+; CHECK: ret
+  %vminnm2.i = tail call <4 x float> @llvm.aarch64.neon.fminnm.v4f32(<4 x float> %a, <4 x float> %b) nounwind
+  ret <4 x float> %vminnm2.i
+}
+
+define <2 x double> @f6(<2 x double> %a, <2 x double> %b) nounwind readnone ssp {
+; CHECK: fminnm.2d	v0, v0, v1
+; CHECK: ret
+  %vminnm2.i = tail call <2 x double> @llvm.aarch64.neon.fminnm.v2f64(<2 x double> %a, <2 x double> %b) nounwind
+  ret <2 x double> %vminnm2.i
+}
+
+declare <2 x double> @llvm.aarch64.neon.fminnm.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fminnm.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.fminnm.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.fmaxnm.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fmaxnm.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.fmaxnm.v2f32(<2 x float>, <2 x float>) nounwind readnone
+
+
+define double @test_fmaxnmv(<2 x double> %in) {
+; CHECK-LABEL: test_fmaxnmv:
+; CHECK: fmaxnmp.2d d0, v0
+  %max = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> %in)
+  ret double %max
+}
+
+define double @test_fminnmv(<2 x double> %in) {
+; CHECK-LABEL: test_fminnmv:
+; CHECK: fminnmp.2d d0, v0
+  %min = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> %in)
+  ret double %min
+}
+
+declare double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double>)
+declare double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double>)
diff --git a/llvm/test/CodeGen/ARM64/vmovn.ll b/llvm/test/CodeGen/AArch64/arm64-vmovn.ll
similarity index 74%
rename from llvm/test/CodeGen/ARM64/vmovn.ll
rename to llvm/test/CodeGen/AArch64/arm64-vmovn.ll
index 675633b..67e2816 100644
--- a/llvm/test/CodeGen/ARM64/vmovn.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vmovn.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 define <8 x i8> @xtn8b(<8 x i16> %A) nounwind {
 ;CHECK-LABEL: xtn8b:
@@ -62,7 +62,7 @@
 ;CHECK-NOT: ld1
 ;CHECK: sqxtn.8b v0, v0
 ;CHECK-NEXT: ret
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.sqxtn.v8i8(<8 x i16> %A)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> %A)
         ret <8 x i8> %tmp3
 }
 
@@ -71,7 +71,7 @@
 ;CHECK-NOT: ld1
 ;CHECK: sqxtn.4h v0, v0
 ;CHECK-NEXT: ret
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.sqxtn.v4i16(<4 x i32> %A)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %A)
         ret <4 x i16> %tmp3
 }
 
@@ -80,7 +80,7 @@
 ;CHECK-NOT: ld1
 ;CHECK: sqxtn.2s v0, v0
 ;CHECK-NEXT: ret
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.sqxtn.v2i32(<2 x i64> %A)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> %A)
         ret <2 x i32> %tmp3
 }
 
@@ -89,7 +89,7 @@
 ;CHECK-NOT: ld1
 ;CHECK: sqxtn2.16b v0, v1
 ;CHECK-NEXT: ret
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.sqxtn.v8i8(<8 x i16> %A)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> %A)
         %res = shufflevector <8 x i8> %ret, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
         ret <16 x i8> %res
 }
@@ -99,7 +99,7 @@
 ;CHECK-NOT: ld1
 ;CHECK: sqxtn2.8h v0, v1
 ;CHECK-NEXT: ret
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.sqxtn.v4i16(<4 x i32> %A)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> %A)
         %res = shufflevector <4 x i16> %ret, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
         ret <8 x i16> %res
 }
@@ -109,21 +109,21 @@
 ;CHECK-NOT: ld1
 ;CHECK: sqxtn2.4s v0, v1
 ;CHECK-NEXT: ret
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.sqxtn.v2i32(<2 x i64> %A)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> %A)
         %res = shufflevector <2 x i32> %ret, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
         ret <4 x i32> %res
 }
 
-declare <8 x i8>  @llvm.arm64.neon.sqxtn.v8i8(<8 x i16>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sqxtn.v4i16(<4 x i32>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqxtn.v2i32(<2 x i64>) nounwind readnone
+declare <8 x i8>  @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64>) nounwind readnone
 
 define <8 x i8> @uqxtn8b(<8 x i16> %A) nounwind {
 ;CHECK-LABEL: uqxtn8b:
 ;CHECK-NOT: ld1
 ;CHECK: uqxtn.8b v0, v0
 ;CHECK-NEXT: ret
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.uqxtn.v8i8(<8 x i16> %A)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %A)
         ret <8 x i8> %tmp3
 }
 
@@ -132,7 +132,7 @@
 ;CHECK-NOT: ld1
 ;CHECK: uqxtn.4h v0, v0
 ;CHECK-NEXT: ret
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.uqxtn.v4i16(<4 x i32> %A)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %A)
         ret <4 x i16> %tmp3
 }
 
@@ -141,7 +141,7 @@
 ;CHECK-NOT: ld1
 ;CHECK: uqxtn.2s v0, v0
 ;CHECK-NEXT: ret
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.uqxtn.v2i32(<2 x i64> %A)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> %A)
         ret <2 x i32> %tmp3
 }
 
@@ -150,7 +150,7 @@
 ;CHECK-NOT: ld1
 ;CHECK: uqxtn2.16b v0, v1
 ;CHECK-NEXT: ret
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.uqxtn.v8i8(<8 x i16> %A)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> %A)
         %res = shufflevector <8 x i8> %ret, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
         ret <16 x i8> %res
 }
@@ -160,7 +160,7 @@
 ;CHECK-NOT: ld1
 ;CHECK: uqxtn2.8h v0, v1
 ;CHECK-NEXT: ret
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.uqxtn.v4i16(<4 x i32> %A)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> %A)
         %res = shufflevector <4 x i16> %ret, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
         ret <8 x i16> %res
 }
@@ -170,21 +170,21 @@
 ;CHECK-NOT: ld1
 ;CHECK: uqxtn2.4s v0, v1
 ;CHECK-NEXT: ret
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.uqxtn.v2i32(<2 x i64> %A)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> %A)
         %res = shufflevector <2 x i32> %ret, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
         ret <4 x i32> %res
 }
 
-declare <8 x i8>  @llvm.arm64.neon.uqxtn.v8i8(<8 x i16>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.uqxtn.v4i16(<4 x i32>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.uqxtn.v2i32(<2 x i64>) nounwind readnone
+declare <8 x i8>  @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64>) nounwind readnone
 
 define <8 x i8> @sqxtun8b(<8 x i16> %A) nounwind {
 ;CHECK-LABEL: sqxtun8b:
 ;CHECK-NOT: ld1
 ;CHECK: sqxtun.8b v0, v0
 ;CHECK-NEXT: ret
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.sqxtun.v8i8(<8 x i16> %A)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> %A)
         ret <8 x i8> %tmp3
 }
 
@@ -193,7 +193,7 @@
 ;CHECK-NOT: ld1
 ;CHECK: sqxtun.4h v0, v0
 ;CHECK-NEXT: ret
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.sqxtun.v4i16(<4 x i32> %A)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %A)
         ret <4 x i16> %tmp3
 }
 
@@ -202,7 +202,7 @@
 ;CHECK-NOT: ld1
 ;CHECK: sqxtun.2s v0, v0
 ;CHECK-NEXT: ret
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.sqxtun.v2i32(<2 x i64> %A)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> %A)
         ret <2 x i32> %tmp3
 }
 
@@ -211,7 +211,7 @@
 ;CHECK-NOT: ld1
 ;CHECK: sqxtun2.16b v0, v1
 ;CHECK-NEXT: ret
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.sqxtun.v8i8(<8 x i16> %A)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> %A)
         %res = shufflevector <8 x i8> %ret, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
         ret <16 x i8> %res
 }
@@ -221,7 +221,7 @@
 ;CHECK-NOT: ld1
 ;CHECK: sqxtun2.8h v0, v1
 ;CHECK-NEXT: ret
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.sqxtun.v4i16(<4 x i32> %A)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> %A)
         %res = shufflevector <4 x i16> %ret, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
         ret <8 x i16> %res
 }
@@ -231,12 +231,12 @@
 ;CHECK-NOT: ld1
 ;CHECK: sqxtun2.4s v0, v1
 ;CHECK-NEXT: ret
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.sqxtun.v2i32(<2 x i64> %A)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> %A)
         %res = shufflevector <2 x i32> %ret, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
         ret <4 x i32> %res
 }
 
-declare <8 x i8>  @llvm.arm64.neon.sqxtun.v8i8(<8 x i16>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sqxtun.v4i16(<4 x i32>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqxtun.v2i32(<2 x i64>) nounwind readnone
+declare <8 x i8>  @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64>) nounwind readnone
 
diff --git a/llvm/test/CodeGen/ARM64/vmul.ll b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
similarity index 80%
rename from llvm/test/CodeGen/ARM64/vmul.ll
rename to llvm/test/CodeGen/AArch64/arm64-vmul.ll
index b6bd16a..6fa60fe 100644
--- a/llvm/test/CodeGen/ARM64/vmul.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vmul.ll
@@ -1,4 +1,4 @@
-; RUN: llc -asm-verbose=false < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc -asm-verbose=false < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 
 define <8 x i16> @smull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
@@ -6,7 +6,7 @@
 ;CHECK: smull.8h
   %tmp1 = load <8 x i8>* %A
   %tmp2 = load <8 x i8>* %B
-  %tmp3 = call <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+  %tmp3 = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
   ret <8 x i16> %tmp3
 }
 
@@ -15,7 +15,7 @@
 ;CHECK: smull.4s
   %tmp1 = load <4 x i16>* %A
   %tmp2 = load <4 x i16>* %B
-  %tmp3 = call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+  %tmp3 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
   ret <4 x i32> %tmp3
 }
 
@@ -24,20 +24,20 @@
 ;CHECK: smull.2d
   %tmp1 = load <2 x i32>* %A
   %tmp2 = load <2 x i32>* %B
-  %tmp3 = call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+  %tmp3 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
   ret <2 x i64> %tmp3
 }
 
-declare <8 x i16>  @llvm.arm64.neon.smull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+declare <8 x i16>  @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
 
 define <8 x i16> @umull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ;CHECK-LABEL: umull8h:
 ;CHECK: umull.8h
   %tmp1 = load <8 x i8>* %A
   %tmp2 = load <8 x i8>* %B
-  %tmp3 = call <8 x i16> @llvm.arm64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+  %tmp3 = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
   ret <8 x i16> %tmp3
 }
 
@@ -46,7 +46,7 @@
 ;CHECK: umull.4s
   %tmp1 = load <4 x i16>* %A
   %tmp2 = load <4 x i16>* %B
-  %tmp3 = call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+  %tmp3 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
   ret <4 x i32> %tmp3
 }
 
@@ -55,20 +55,20 @@
 ;CHECK: umull.2d
   %tmp1 = load <2 x i32>* %A
   %tmp2 = load <2 x i32>* %B
-  %tmp3 = call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+  %tmp3 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
   ret <2 x i64> %tmp3
 }
 
-declare <8 x i16>  @llvm.arm64.neon.umull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+declare <8 x i16>  @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
 
 define <4 x i32> @sqdmull4s(<4 x i16>* %A, <4 x i16>* %B) nounwind {
 ;CHECK-LABEL: sqdmull4s:
 ;CHECK: sqdmull.4s
   %tmp1 = load <4 x i16>* %A
   %tmp2 = load <4 x i16>* %B
-  %tmp3 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
   ret <4 x i32> %tmp3
 }
 
@@ -77,7 +77,7 @@
 ;CHECK: sqdmull.2d
   %tmp1 = load <2 x i32>* %A
   %tmp2 = load <2 x i32>* %B
-  %tmp3 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+  %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
   ret <2 x i64> %tmp3
 }
 
@@ -88,7 +88,7 @@
   %load2 = load <8 x i16>* %B
   %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %tmp3 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
   ret <4 x i32> %tmp3
 }
 
@@ -99,31 +99,31 @@
   %load2 = load <4 x i32>* %B
   %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
-  %tmp3 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+  %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
   ret <2 x i64> %tmp3
 }
 
 
-declare <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
 
 define <8 x i16> @pmull8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ;CHECK-LABEL: pmull8h:
 ;CHECK: pmull.8h
   %tmp1 = load <8 x i8>* %A
   %tmp2 = load <8 x i8>* %B
-  %tmp3 = call <8 x i16> @llvm.arm64.neon.pmull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+  %tmp3 = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
   ret <8 x i16> %tmp3
 }
 
-declare <8 x i16> @llvm.arm64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
 
 define <4 x i16> @sqdmulh_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
 ;CHECK-LABEL: sqdmulh_4h:
 ;CHECK: sqdmulh.4h
   %tmp1 = load <4 x i16>* %A
   %tmp2 = load <4 x i16>* %B
-  %tmp3 = call <4 x i16> @llvm.arm64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
   ret <4 x i16> %tmp3
 }
 
@@ -132,7 +132,7 @@
 ;CHECK: sqdmulh.8h
   %tmp1 = load <8 x i16>* %A
   %tmp2 = load <8 x i16>* %B
-  %tmp3 = call <8 x i16> @llvm.arm64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
   ret <8 x i16> %tmp3
 }
 
@@ -141,7 +141,7 @@
 ;CHECK: sqdmulh.2s
   %tmp1 = load <2 x i32>* %A
   %tmp2 = load <2 x i32>* %B
-  %tmp3 = call <2 x i32> @llvm.arm64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
   ret <2 x i32> %tmp3
 }
 
@@ -150,7 +150,7 @@
 ;CHECK: sqdmulh.4s
   %tmp1 = load <4 x i32>* %A
   %tmp2 = load <4 x i32>* %B
-  %tmp3 = call <4 x i32> @llvm.arm64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
   ret <4 x i32> %tmp3
 }
 
@@ -159,22 +159,22 @@
 ;CHECK: sqdmulh s0, {{s[0-9]+}}, {{s[0-9]+}}
   %tmp1 = load i32* %A
   %tmp2 = load i32* %B
-  %tmp3 = call i32 @llvm.arm64.neon.sqdmulh.i32(i32 %tmp1, i32 %tmp2)
+  %tmp3 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %tmp1, i32 %tmp2)
   ret i32 %tmp3
 }
 
-declare <4 x i16> @llvm.arm64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare i32 @llvm.arm64.neon.sqdmulh.i32(i32, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare i32 @llvm.aarch64.neon.sqdmulh.i32(i32, i32) nounwind readnone
 
 define <4 x i16> @sqrdmulh_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
 ;CHECK-LABEL: sqrdmulh_4h:
 ;CHECK: sqrdmulh.4h
   %tmp1 = load <4 x i16>* %A
   %tmp2 = load <4 x i16>* %B
-  %tmp3 = call <4 x i16> @llvm.arm64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+  %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
   ret <4 x i16> %tmp3
 }
 
@@ -183,7 +183,7 @@
 ;CHECK: sqrdmulh.8h
   %tmp1 = load <8 x i16>* %A
   %tmp2 = load <8 x i16>* %B
-  %tmp3 = call <8 x i16> @llvm.arm64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+  %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
   ret <8 x i16> %tmp3
 }
 
@@ -192,7 +192,7 @@
 ;CHECK: sqrdmulh.2s
   %tmp1 = load <2 x i32>* %A
   %tmp2 = load <2 x i32>* %B
-  %tmp3 = call <2 x i32> @llvm.arm64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+  %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
   ret <2 x i32> %tmp3
 }
 
@@ -201,7 +201,7 @@
 ;CHECK: sqrdmulh.4s
   %tmp1 = load <4 x i32>* %A
   %tmp2 = load <4 x i32>* %B
-  %tmp3 = call <4 x i32> @llvm.arm64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+  %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
   ret <4 x i32> %tmp3
 }
 
@@ -210,22 +210,22 @@
 ;CHECK: sqrdmulh s0, {{s[0-9]+}}, {{s[0-9]+}}
   %tmp1 = load i32* %A
   %tmp2 = load i32* %B
-  %tmp3 = call i32 @llvm.arm64.neon.sqrdmulh.i32(i32 %tmp1, i32 %tmp2)
+  %tmp3 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %tmp1, i32 %tmp2)
   ret i32 %tmp3
 }
 
-declare <4 x i16> @llvm.arm64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare i32 @llvm.arm64.neon.sqrdmulh.i32(i32, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32) nounwind readnone
 
 define <2 x float> @fmulx_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
 ;CHECK-LABEL: fmulx_2s:
 ;CHECK: fmulx.2s
   %tmp1 = load <2 x float>* %A
   %tmp2 = load <2 x float>* %B
-  %tmp3 = call <2 x float> @llvm.arm64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+  %tmp3 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
   ret <2 x float> %tmp3
 }
 
@@ -234,7 +234,7 @@
 ;CHECK: fmulx.4s
   %tmp1 = load <4 x float>* %A
   %tmp2 = load <4 x float>* %B
-  %tmp3 = call <4 x float> @llvm.arm64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+  %tmp3 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
   ret <4 x float> %tmp3
 }
 
@@ -243,13 +243,13 @@
 ;CHECK: fmulx.2d
   %tmp1 = load <2 x double>* %A
   %tmp2 = load <2 x double>* %B
-  %tmp3 = call <2 x double> @llvm.arm64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+  %tmp3 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
   ret <2 x double> %tmp3
 }
 
-declare <2 x float> @llvm.arm64.neon.fmulx.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.fmulx.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.fmulx.v2f64(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double>, <2 x double>) nounwind readnone
 
 define <4 x i32> @smlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
 ;CHECK-LABEL: smlal4s:
@@ -257,7 +257,7 @@
   %tmp1 = load <4 x i16>* %A
   %tmp2 = load <4 x i16>* %B
   %tmp3 = load <4 x i32>* %C
-  %tmp4 = call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+  %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
   %tmp5 = add <4 x i32> %tmp3, %tmp4
   ret <4 x i32> %tmp5
 }
@@ -268,7 +268,7 @@
   %tmp1 = load <2 x i32>* %A
   %tmp2 = load <2 x i32>* %B
   %tmp3 = load <2 x i64>* %C
-  %tmp4 = call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+  %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
   %tmp5 = add <2 x i64> %tmp3, %tmp4
   ret <2 x i64> %tmp5
 }
@@ -279,7 +279,7 @@
   %tmp1 = load <4 x i16>* %A
   %tmp2 = load <4 x i16>* %B
   %tmp3 = load <4 x i32>* %C
-  %tmp4 = call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+  %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
   %tmp5 = sub <4 x i32> %tmp3, %tmp4
   ret <4 x i32> %tmp5
 }
@@ -290,15 +290,15 @@
   %tmp1 = load <2 x i32>* %A
   %tmp2 = load <2 x i32>* %B
   %tmp3 = load <2 x i64>* %C
-  %tmp4 = call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+  %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
   %tmp5 = sub <2 x i64> %tmp3, %tmp4
   ret <2 x i64> %tmp5
 }
 
-declare <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
-declare <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
-declare <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
-declare <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
+declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>)
+declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
+declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>)
 
 define <4 x i32> @sqdmlal4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
 ;CHECK-LABEL: sqdmlal4s:
@@ -306,8 +306,8 @@
   %tmp1 = load <4 x i16>* %A
   %tmp2 = load <4 x i16>* %B
   %tmp3 = load <4 x i32>* %C
-  %tmp4 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
-  %tmp5 = call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
+  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
   ret <4 x i32> %tmp5
 }
 
@@ -317,8 +317,8 @@
   %tmp1 = load <2 x i32>* %A
   %tmp2 = load <2 x i32>* %B
   %tmp3 = load <2 x i64>* %C
-  %tmp4 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
-  %tmp5 = call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
+  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
   ret <2 x i64> %tmp5
 }
 
@@ -330,8 +330,8 @@
   %tmp3 = load <4 x i32>* %C
   %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %tmp4 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
-  %tmp5 = call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
+  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
   ret <4 x i32> %tmp5
 }
 
@@ -343,8 +343,8 @@
   %tmp3 = load <2 x i64>* %C
   %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
-  %tmp4 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
-  %tmp5 = call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
+  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
   ret <2 x i64> %tmp5
 }
 
@@ -354,8 +354,8 @@
   %tmp1 = load <4 x i16>* %A
   %tmp2 = load <4 x i16>* %B
   %tmp3 = load <4 x i32>* %C
-  %tmp4 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
-  %tmp5 = call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
+  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
   ret <4 x i32> %tmp5
 }
 
@@ -365,8 +365,8 @@
   %tmp1 = load <2 x i32>* %A
   %tmp2 = load <2 x i32>* %B
   %tmp3 = load <2 x i64>* %C
-  %tmp4 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
-  %tmp5 = call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
+  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
   ret <2 x i64> %tmp5
 }
 
@@ -378,8 +378,8 @@
   %tmp3 = load <4 x i32>* %C
   %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %tmp4 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
-  %tmp5 = call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
+  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp4)
   ret <4 x i32> %tmp5
 }
 
@@ -391,8 +391,8 @@
   %tmp3 = load <2 x i64>* %C
   %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
-  %tmp4 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
-  %tmp5 = call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
+  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp4)
   ret <2 x i64> %tmp5
 }
 
@@ -402,7 +402,7 @@
   %tmp1 = load <4 x i16>* %A
   %tmp2 = load <4 x i16>* %B
   %tmp3 = load <4 x i32>* %C
-  %tmp4 = call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+  %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
   %tmp5 = add <4 x i32> %tmp3, %tmp4
   ret <4 x i32> %tmp5
 }
@@ -413,7 +413,7 @@
   %tmp1 = load <2 x i32>* %A
   %tmp2 = load <2 x i32>* %B
   %tmp3 = load <2 x i64>* %C
-  %tmp4 = call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+  %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
   %tmp5 = add <2 x i64> %tmp3, %tmp4
   ret <2 x i64> %tmp5
 }
@@ -424,7 +424,7 @@
   %tmp1 = load <4 x i16>* %A
   %tmp2 = load <4 x i16>* %B
   %tmp3 = load <4 x i32>* %C
-  %tmp4 = call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+  %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
   %tmp5 = sub <4 x i32> %tmp3, %tmp4
   ret <4 x i32> %tmp5
 }
@@ -435,7 +435,7 @@
   %tmp1 = load <2 x i32>* %A
   %tmp2 = load <2 x i32>* %B
   %tmp3 = load <2 x i64>* %C
-  %tmp4 = call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+  %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
   %tmp5 = sub <2 x i64> %tmp3, %tmp4
   ret <2 x i64> %tmp5
 }
@@ -717,7 +717,7 @@
   %tmp1 = load <2 x float>* %A
   %tmp2 = load <2 x float>* %B
   %tmp3 = shufflevector <2 x float> %tmp2, <2 x float> %tmp2, <2 x i32> <i32 1, i32 1>
-  %tmp4 = call <2 x float> @llvm.arm64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp3)
+  %tmp4 = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %tmp1, <2 x float> %tmp3)
   ret <2 x float> %tmp4
 }
 
@@ -728,7 +728,7 @@
   %tmp1 = load <4 x float>* %A
   %tmp2 = load <4 x float>* %B
   %tmp3 = shufflevector <4 x float> %tmp2, <4 x float> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  %tmp4 = call <4 x float> @llvm.arm64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp3)
+  %tmp4 = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> %tmp1, <4 x float> %tmp3)
   ret <4 x float> %tmp4
 }
 
@@ -739,7 +739,7 @@
   %tmp1 = load <2 x double>* %A
   %tmp2 = load <2 x double>* %B
   %tmp3 = shufflevector <2 x double> %tmp2, <2 x double> %tmp2, <2 x i32> <i32 1, i32 1>
-  %tmp4 = call <2 x double> @llvm.arm64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp3)
+  %tmp4 = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> %tmp1, <2 x double> %tmp3)
   ret <2 x double> %tmp4
 }
 
@@ -750,7 +750,7 @@
   %tmp1 = load <4 x i16>* %A
   %tmp2 = load <4 x i16>* %B
   %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  %tmp4 = call <4 x i16> @llvm.arm64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3)
+  %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3)
   ret <4 x i16> %tmp4
 }
 
@@ -761,7 +761,7 @@
   %tmp1 = load <8 x i16>* %A
   %tmp2 = load <8 x i16>* %B
   %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  %tmp4 = call <8 x i16> @llvm.arm64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp3)
+  %tmp4 = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp3)
   ret <8 x i16> %tmp4
 }
 
@@ -772,7 +772,7 @@
   %tmp1 = load <2 x i32>* %A
   %tmp2 = load <2 x i32>* %B
   %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
-  %tmp4 = call <2 x i32> @llvm.arm64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp3)
+  %tmp4 = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp3)
   ret <2 x i32> %tmp4
 }
 
@@ -783,7 +783,7 @@
   %tmp1 = load <4 x i32>* %A
   %tmp2 = load <4 x i32>* %B
   %tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  %tmp4 = call <4 x i32> @llvm.arm64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp3)
+  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp3)
   ret <4 x i32> %tmp4
 }
 
@@ -792,7 +792,7 @@
 ;CHECK-NOT: dup
 ;CHECK: sqdmulh.s s0, {{s[0-9]+}}, {{v[0-9]+}}[1]
   %tmp1 = extractelement <4 x i32> %B, i32 1
-  %tmp2 = call i32 @llvm.arm64.neon.sqdmulh.i32(i32 %A, i32 %tmp1)
+  %tmp2 = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %A, i32 %tmp1)
   ret i32 %tmp2
 }
 
@@ -803,7 +803,7 @@
   %tmp1 = load <4 x i16>* %A
   %tmp2 = load <4 x i16>* %B
   %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  %tmp4 = call <4 x i16> @llvm.arm64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3)
+  %tmp4 = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp3)
   ret <4 x i16> %tmp4
 }
 
@@ -814,7 +814,7 @@
   %tmp1 = load <8 x i16>* %A
   %tmp2 = load <8 x i16>* %B
   %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> %tmp2, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  %tmp4 = call <8 x i16> @llvm.arm64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp3)
+  %tmp4 = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp3)
   ret <8 x i16> %tmp4
 }
 
@@ -825,7 +825,7 @@
   %tmp1 = load <2 x i32>* %A
   %tmp2 = load <2 x i32>* %B
   %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
-  %tmp4 = call <2 x i32> @llvm.arm64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp3)
+  %tmp4 = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp3)
   ret <2 x i32> %tmp4
 }
 
@@ -836,7 +836,7 @@
   %tmp1 = load <4 x i32>* %A
   %tmp2 = load <4 x i32>* %B
   %tmp3 = shufflevector <4 x i32> %tmp2, <4 x i32> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  %tmp4 = call <4 x i32> @llvm.arm64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp3)
+  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp3)
   ret <4 x i32> %tmp4
 }
 
@@ -845,7 +845,7 @@
 ;CHECK-NOT: dup
 ;CHECK: sqrdmulh.s s0, {{s[0-9]+}}, {{v[0-9]+}}[1]
   %tmp1 = extractelement <4 x i32> %B, i32 1
-  %tmp2 = call i32 @llvm.arm64.neon.sqrdmulh.i32(i32 %A, i32 %tmp1)
+  %tmp2 = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %A, i32 %tmp1)
   ret i32 %tmp2
 }
 
@@ -856,7 +856,7 @@
   %tmp1 = load <4 x i16>* %A
   %tmp2 = load <4 x i16>* %B
   %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  %tmp4 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
+  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
   ret <4 x i32> %tmp4
 }
 
@@ -867,7 +867,7 @@
   %tmp1 = load <2 x i32>* %A
   %tmp2 = load <2 x i32>* %B
   %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
-  %tmp4 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
+  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
   ret <2 x i64> %tmp4
 }
 
@@ -879,7 +879,7 @@
   %load2 = load <8 x i16>* %B
   %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  %tmp4 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+  %tmp4 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
   ret <4 x i32> %tmp4
 }
 
@@ -891,7 +891,7 @@
   %load2 = load <4 x i32>* %B
   %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
-  %tmp4 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+  %tmp4 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
   ret <2 x i64> %tmp4
 }
 
@@ -902,7 +902,7 @@
   %tmp1 = load <4 x i16>* %A
   %tmp2 = load <4 x i16>* %B
   %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  %tmp4 = call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
+  %tmp4 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
   ret <4 x i32> %tmp4
 }
 
@@ -913,7 +913,7 @@
   %tmp1 = load <2 x i32>* %A
   %tmp2 = load <2 x i32>* %B
   %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
-  %tmp4 = call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
+  %tmp4 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
   ret <2 x i64> %tmp4
 }
 
@@ -924,7 +924,7 @@
   %tmp1 = load <4 x i16>* %A
   %tmp2 = load <4 x i16>* %B
   %tmp3 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  %tmp4 = call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
+  %tmp4 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3)
   ret <4 x i32> %tmp4
 }
 
@@ -935,7 +935,7 @@
   %tmp1 = load <2 x i32>* %A
   %tmp2 = load <2 x i32>* %B
   %tmp3 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
-  %tmp4 = call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
+  %tmp4 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3)
   ret <2 x i64> %tmp4
 }
 
@@ -947,7 +947,7 @@
   %tmp2 = load <4 x i16>* %B
   %tmp3 = load <4 x i32>* %C
   %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  %tmp5 = call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
+  %tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
   %tmp6 = add <4 x i32> %tmp3, %tmp5
   ret <4 x i32> %tmp6
 }
@@ -960,7 +960,7 @@
   %tmp2 = load <2 x i32>* %B
   %tmp3 = load <2 x i64>* %C
   %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
-  %tmp5 = call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
+  %tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
   %tmp6 = add <2 x i64> %tmp3, %tmp5
   ret <2 x i64> %tmp6
 }
@@ -973,8 +973,8 @@
   %tmp2 = load <4 x i16>* %B
   %tmp3 = load <4 x i32>* %C
   %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  %tmp5 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
-  %tmp6 = call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
+  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
+  %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
   ret <4 x i32> %tmp6
 }
 
@@ -986,8 +986,8 @@
   %tmp2 = load <2 x i32>* %B
   %tmp3 = load <2 x i64>* %C
   %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
-  %tmp5 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
-  %tmp6 = call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
+  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
+  %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
   ret <2 x i64> %tmp6
 }
 
@@ -1000,8 +1000,8 @@
   %tmp3 = load <4 x i32>* %C
   %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  %tmp5 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
-  %tmp6 = call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
+  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+  %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
   ret <4 x i32> %tmp6
 }
 
@@ -1014,8 +1014,8 @@
   %tmp3 = load <2 x i64>* %C
   %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
-  %tmp5 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
-  %tmp6 = call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
+  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+  %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
   ret <2 x i64> %tmp6
 }
 
@@ -1024,45 +1024,45 @@
 ;CHECK: sqdmlal.4s
   %lhs = insertelement <4 x i16> undef, i16 %B, i32 0
   %rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-  %prod.vec = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
+  %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
   %prod = extractelement <4 x i32> %prod.vec, i32 0
-  %res = call i32 @llvm.arm64.neon.sqadd.i32(i32 %A, i32 %prod)
+  %res = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %A, i32 %prod)
   ret i32 %res
 }
-declare i32 @llvm.arm64.neon.sqadd.i32(i32, i32)
+declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
 
 define i32 @sqdmlsl_lane_1s(i32 %A, i16 %B, <4 x i16> %C) nounwind {
 ;CHECK-LABEL: sqdmlsl_lane_1s:
 ;CHECK: sqdmlsl.4s
   %lhs = insertelement <4 x i16> undef, i16 %B, i32 0
   %rhs = shufflevector <4 x i16> %C, <4 x i16> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-  %prod.vec = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
+  %prod.vec = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %lhs, <4 x i16> %rhs)
   %prod = extractelement <4 x i32> %prod.vec, i32 0
-  %res = call i32 @llvm.arm64.neon.sqsub.i32(i32 %A, i32 %prod)
+  %res = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %A, i32 %prod)
   ret i32 %res
 }
-declare i32 @llvm.arm64.neon.sqsub.i32(i32, i32)
+declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
 
 define i64 @sqdmlal_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
 ;CHECK-LABEL: sqdmlal_lane_1d:
 ;CHECK: sqdmlal.s
   %rhs = extractelement <2 x i32> %C, i32 1
-  %prod = call i64 @llvm.arm64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
-  %res = call i64 @llvm.arm64.neon.sqadd.i64(i64 %A, i64 %prod)
+  %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
+  %res = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %A, i64 %prod)
   ret i64 %res
 }
-declare i64 @llvm.arm64.neon.sqdmulls.scalar(i32, i32)
-declare i64 @llvm.arm64.neon.sqadd.i64(i64, i64)
+declare i64 @llvm.aarch64.neon.sqdmulls.scalar(i32, i32)
+declare i64 @llvm.aarch64.neon.sqadd.i64(i64, i64)
 
 define i64 @sqdmlsl_lane_1d(i64 %A, i32 %B, <2 x i32> %C) nounwind {
 ;CHECK-LABEL: sqdmlsl_lane_1d:
 ;CHECK: sqdmlsl.s
   %rhs = extractelement <2 x i32> %C, i32 1
-  %prod = call i64 @llvm.arm64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
-  %res = call i64 @llvm.arm64.neon.sqsub.i64(i64 %A, i64 %prod)
+  %prod = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %B, i32 %rhs)
+  %res = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %A, i64 %prod)
   ret i64 %res
 }
-declare i64 @llvm.arm64.neon.sqsub.i64(i64, i64)
+declare i64 @llvm.aarch64.neon.sqsub.i64(i64, i64)
 
 
 define <4 x i32> @umlal_lane_4s(<4 x i16>* %A, <4 x i16>* %B, <4 x i32>* %C) nounwind {
@@ -1073,7 +1073,7 @@
   %tmp2 = load <4 x i16>* %B
   %tmp3 = load <4 x i32>* %C
   %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  %tmp5 = call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
+  %tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
   %tmp6 = add <4 x i32> %tmp3, %tmp5
   ret <4 x i32> %tmp6
 }
@@ -1086,7 +1086,7 @@
   %tmp2 = load <2 x i32>* %B
   %tmp3 = load <2 x i64>* %C
   %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
-  %tmp5 = call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
+  %tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
   %tmp6 = add <2 x i64> %tmp3, %tmp5
   ret <2 x i64> %tmp6
 }
@@ -1100,7 +1100,7 @@
   %tmp2 = load <4 x i16>* %B
   %tmp3 = load <4 x i32>* %C
   %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  %tmp5 = call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
+  %tmp5 = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
   %tmp6 = sub <4 x i32> %tmp3, %tmp5
   ret <4 x i32> %tmp6
 }
@@ -1113,7 +1113,7 @@
   %tmp2 = load <2 x i32>* %B
   %tmp3 = load <2 x i64>* %C
   %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
-  %tmp5 = call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
+  %tmp5 = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
   %tmp6 = sub <2 x i64> %tmp3, %tmp5
   ret <2 x i64> %tmp6
 }
@@ -1126,8 +1126,8 @@
   %tmp2 = load <4 x i16>* %B
   %tmp3 = load <4 x i32>* %C
   %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  %tmp5 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
-  %tmp6 = call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
+  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
+  %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
   ret <4 x i32> %tmp6
 }
 
@@ -1139,8 +1139,8 @@
   %tmp2 = load <2 x i32>* %B
   %tmp3 = load <2 x i64>* %C
   %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
-  %tmp5 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
-  %tmp6 = call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
+  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
+  %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
   ret <2 x i64> %tmp6
 }
 
@@ -1153,8 +1153,8 @@
   %tmp3 = load <4 x i32>* %C
   %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %tmp2 = shufflevector <8 x i16> %load2, <8 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  %tmp5 = call <4 x i32> @llvm.arm64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
-  %tmp6 = call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
+  %tmp5 = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+  %tmp6 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp3, <4 x i32> %tmp5)
   ret <4 x i32> %tmp6
 }
 
@@ -1167,8 +1167,8 @@
   %tmp3 = load <2 x i64>* %C
   %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %tmp2 = shufflevector <4 x i32> %load2, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
-  %tmp5 = call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
-  %tmp6 = call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
+  %tmp5 = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+  %tmp6 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp3, <2 x i64> %tmp5)
   ret <2 x i64> %tmp6
 }
 
@@ -1180,7 +1180,7 @@
   %tmp2 = load <4 x i16>* %B
   %tmp3 = load <4 x i32>* %C
   %tmp4 = shufflevector <4 x i16> %tmp2, <4 x i16> %tmp2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  %tmp5 = call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
+  %tmp5 = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp4)
   %tmp6 = sub <4 x i32> %tmp3, %tmp5
   ret <4 x i32> %tmp6
 }
@@ -1193,7 +1193,7 @@
   %tmp2 = load <2 x i32>* %B
   %tmp3 = load <2 x i64>* %C
   %tmp4 = shufflevector <2 x i32> %tmp2, <2 x i32> %tmp2, <2 x i32> <i32 1, i32 1>
-  %tmp5 = call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
+  %tmp5 = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp4)
   %tmp6 = sub <2 x i64> %tmp3, %tmp5
   ret <2 x i64> %tmp6
 }
@@ -1202,7 +1202,7 @@
 define float @fmulxs(float %a, float %b) nounwind {
 ; CHECK-LABEL: fmulxs:
 ; CHECKNEXT: fmulx s0, s0, s1
-  %fmulx.i = tail call float @llvm.arm64.neon.fmulx.f32(float %a, float %b) nounwind
+  %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
 ; CHECKNEXT: ret
   ret float %fmulx.i
 }
@@ -1210,7 +1210,7 @@
 define double @fmulxd(double %a, double %b) nounwind {
 ; CHECK-LABEL: fmulxd:
 ; CHECKNEXT: fmulx d0, d0, d1
-  %fmulx.i = tail call double @llvm.arm64.neon.fmulx.f64(double %a, double %b) nounwind
+  %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
 ; CHECKNEXT: ret
   ret double %fmulx.i
 }
@@ -1219,7 +1219,7 @@
 ; CHECK-LABEL: fmulxs_lane:
 ; CHECKNEXT: fmulx.s s0, s0, v1[3]
   %b = extractelement <4 x float> %vec, i32 3
-  %fmulx.i = tail call float @llvm.arm64.neon.fmulx.f32(float %a, float %b) nounwind
+  %fmulx.i = tail call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) nounwind
 ; CHECKNEXT: ret
   ret float %fmulx.i
 }
@@ -1228,13 +1228,13 @@
 ; CHECK-LABEL: fmulxd_lane:
 ; CHECKNEXT: fmulx d0, d0, v1[1]
   %b = extractelement <2 x double> %vec, i32 1
-  %fmulx.i = tail call double @llvm.arm64.neon.fmulx.f64(double %a, double %b) nounwind
+  %fmulx.i = tail call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) nounwind
 ; CHECKNEXT: ret
   ret double %fmulx.i
 }
 
-declare double @llvm.arm64.neon.fmulx.f64(double, double) nounwind readnone
-declare float @llvm.arm64.neon.fmulx.f32(float, float) nounwind readnone
+declare double @llvm.aarch64.neon.fmulx.f64(double, double) nounwind readnone
+declare float @llvm.aarch64.neon.fmulx.f32(float, float) nounwind readnone
 
 
 define <8 x i16> @smull2_8h_simple(<16 x i8> %a, <16 x i8> %b) nounwind {
@@ -1243,7 +1243,7 @@
 ; CHECK-NEXT: ret
   %1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %3 = tail call <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8> %1, <8 x i8> %2) #2
+  %3 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %1, <8 x i8> %2) #2
   ret <8 x i16> %3
 }
 
@@ -1256,7 +1256,7 @@
   %tmp2 = bitcast <16 x i8> %b to <2 x i64>
   %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
   %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8>
-  %vmull.i.i = tail call <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
+  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
   ret <8 x i16> %vmull.i.i
 }
 
@@ -1269,7 +1269,7 @@
   %tmp2 = bitcast <8 x i16> %b to <2 x i64>
   %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
   %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
-  %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
+  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
   ret <4 x i32> %vmull2.i.i
 }
 
@@ -1282,7 +1282,7 @@
   %tmp2 = bitcast <4 x i32> %b to <2 x i64>
   %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
   %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
-  %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
+  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
   ret <2 x i64> %vmull2.i.i
 }
 
@@ -1295,7 +1295,7 @@
   %tmp2 = bitcast <16 x i8> %b to <2 x i64>
   %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
   %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <8 x i8>
-  %vmull.i.i = tail call <8 x i16> @llvm.arm64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
+  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
   ret <8 x i16> %vmull.i.i
 }
 
@@ -1308,7 +1308,7 @@
   %tmp2 = bitcast <8 x i16> %b to <2 x i64>
   %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
   %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
-  %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
+  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
   ret <4 x i32> %vmull2.i.i
 }
 
@@ -1321,7 +1321,7 @@
   %tmp2 = bitcast <4 x i32> %b to <2 x i64>
   %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
   %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
-  %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
+  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
   ret <2 x i64> %vmull2.i.i
 }
 
@@ -1334,7 +1334,7 @@
   %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
   %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
   %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
   ret <4 x i32> %vmull2.i
 }
 
@@ -1347,7 +1347,7 @@
   %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
   %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
   %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
   ret <2 x i64> %vmull2.i
 }
 
@@ -1360,7 +1360,7 @@
   %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
   %1 = bitcast <1 x i64> %shuffle.i to <4 x i16>
   %shuffle = shufflevector <4 x i16> %c, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %shuffle) nounwind
   ret <4 x i32> %vmull2.i
 }
 
@@ -1373,7 +1373,7 @@
   %shuffle.i = shufflevector <2 x i64> %0, <2 x i64> undef, <1 x i32> <i32 1>
   %1 = bitcast <1 x i64> %shuffle.i to <2 x i32>
   %shuffle = shufflevector <2 x i32> %c, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %shuffle) nounwind
   ret <2 x i64> %vmull2.i
 }
 
@@ -1388,7 +1388,7 @@
   %tmp2 = bitcast <16 x i8> %c to <2 x i64>
   %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
   %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8>
-  %vmull.i.i.i = tail call <8 x i16> @llvm.arm64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
+  %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
   %add.i = add <8 x i16> %vmull.i.i.i, %a
   ret <8 x i16> %add.i
 }
@@ -1404,7 +1404,7 @@
   %tmp2 = bitcast <8 x i16> %c to <2 x i64>
   %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
   %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16>
-  %vmull2.i.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
+  %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
   %add.i = add <4 x i32> %vmull2.i.i.i, %a
   ret <4 x i32> %add.i
 }
@@ -1420,7 +1420,7 @@
   %tmp2 = bitcast <4 x i32> %c to <2 x i64>
   %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
   %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32>
-  %vmull2.i.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
+  %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
   %add.i = add <2 x i64> %vmull2.i.i.i, %a
   ret <2 x i64> %add.i
 }
@@ -1436,7 +1436,7 @@
   %tmp2 = bitcast <16 x i8> %c to <2 x i64>
   %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
   %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <8 x i8>
-  %vmull.i.i.i = tail call <8 x i16> @llvm.arm64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
+  %vmull.i.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp3) nounwind
   %add.i = add <8 x i16> %vmull.i.i.i, %a
   ret <8 x i16> %add.i
 }
@@ -1452,7 +1452,7 @@
   %tmp2 = bitcast <8 x i16> %c to <2 x i64>
   %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
   %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <4 x i16>
-  %vmull2.i.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
+  %vmull2.i.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
   %add.i = add <4 x i32> %vmull2.i.i.i, %a
   ret <4 x i32> %add.i
 }
@@ -1468,7 +1468,7 @@
   %tmp2 = bitcast <4 x i32> %c to <2 x i64>
   %shuffle.i3.i.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
   %tmp3 = bitcast <1 x i64> %shuffle.i3.i.i to <2 x i32>
-  %vmull2.i.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
+  %vmull2.i.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
   %add.i = add <2 x i64> %vmull2.i.i.i, %a
   ret <2 x i64> %add.i
 }
@@ -1484,7 +1484,7 @@
   %tmp2 = bitcast <8 x i16> %shuffle to <2 x i64>
   %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
   %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
-  %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
+  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
   %add = add <4 x i32> %vmull2.i.i, %a
   ret <4 x i32> %add
 }
@@ -1500,7 +1500,7 @@
   %tmp2 = bitcast <4 x i32> %shuffle to <2 x i64>
   %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
   %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
-  %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
+  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
   %add = add <2 x i64> %vmull2.i.i, %a
   ret <2 x i64> %add
 }
@@ -1517,7 +1517,7 @@
   %tmp2 = bitcast <8 x i16> %shuffle to <2 x i64>
   %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
   %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <4 x i16>
-  %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
+  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp3) nounwind
   %add = add <4 x i32> %vmull2.i.i, %a
   ret <4 x i32> %add
 }
@@ -1533,7 +1533,7 @@
   %tmp2 = bitcast <4 x i32> %shuffle to <2 x i64>
   %shuffle.i3.i = shufflevector <2 x i64> %tmp2, <2 x i64> undef, <1 x i32> <i32 1>
   %tmp3 = bitcast <1 x i64> %shuffle.i3.i to <2 x i32>
-  %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
+  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp3) nounwind
   %add = add <2 x i64> %vmull2.i.i, %a
   ret <2 x i64> %add
 }
@@ -1631,7 +1631,7 @@
 ; CHECK: smull.4s v0, v0, v1[6]
 ; CHECK-NEXT: ret
   %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
   ret <4 x i32> %vmull2.i
 }
 
@@ -1642,7 +1642,7 @@
 ; CHECK: smull.2d v0, v0, v1[2]
 ; CHECK-NEXT: ret
   %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
   ret <2 x i64> %vmull2.i
 }
 define <4 x i32> @vmull_laneq_u16_test(<4 x i16> %a, <8 x i16> %b) nounwind readnone ssp {
@@ -1652,7 +1652,7 @@
 ; CHECK: umull.4s v0, v0, v1[6]
 ; CHECK-NEXT: ret
   %shuffle = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 6, i32 6, i32 6, i32 6>
-  %vmull2.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
+  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) #2
   ret <4 x i32> %vmull2.i
 }
 
@@ -1663,7 +1663,7 @@
 ; CHECK: umull.2d v0, v0, v1[2]
 ; CHECK-NEXT: ret
   %shuffle = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 2>
-  %vmull2.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
+  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) #2
   ret <2 x i64> %vmull2.i
 }
 
@@ -1681,7 +1681,7 @@
   %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
   %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
   %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
-  %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
+  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
   ret <4 x i32> %vmull2.i.i
 }
 
@@ -1696,7 +1696,7 @@
   %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
   %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0
   %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1
-  %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
+  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
   ret <2 x i64> %vmull2.i.i
 }
 
@@ -1714,7 +1714,7 @@
   %vecinit1.i = insertelement <4 x i16> %vecinit.i, i16 %conv, i32 1
   %vecinit2.i = insertelement <4 x i16> %vecinit1.i, i16 %conv, i32 2
   %vecinit3.i = insertelement <4 x i16> %vecinit2.i, i16 %conv, i32 3
-  %vmull2.i.i = tail call <4 x i32> @llvm.arm64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
+  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %1, <4 x i16> %vecinit3.i) nounwind
   ret <4 x i32> %vmull2.i.i
 }
 
@@ -1729,7 +1729,7 @@
   %1 = bitcast <1 x i64> %shuffle.i.i to <2 x i32>
   %vecinit.i = insertelement <2 x i32> undef, i32 %d, i32 0
   %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %d, i32 1
-  %vmull2.i.i = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
+  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %1, <2 x i32> %vecinit1.i) nounwind
   ret <2 x i64> %vmull2.i.i
 }
 
@@ -1787,7 +1787,7 @@
   %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
 
-  %res = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
   ret <2 x i64> %res
 }
 
@@ -1799,8 +1799,8 @@
   %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
 
-  %res = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
-  %sum = call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
+  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+  %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
   ret <2 x i64> %sum
 }
 
@@ -1813,7 +1813,7 @@
 
   %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
 
-  %res = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
+  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhsvec) nounwind
   ret <2 x i64> %res
 }
 
@@ -1826,7 +1826,7 @@
 
   %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 
-  %res = tail call <8 x i16> @llvm.arm64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind
+  %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhsvec) nounwind
   ret <8 x i16> %res
 }
 
@@ -1838,7 +1838,7 @@
   %lhs.high = shufflevector <16 x i8> %lhs, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %rhs.high = shufflevector <8 x i8> %rhs, <8 x i8> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
 
-  %res = tail call <8 x i16> @llvm.arm64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind
+  %res = tail call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %lhs.high, <8 x i8> %rhs.high) nounwind
   ret <8 x i16> %res
 }
 
@@ -1850,7 +1850,7 @@
   %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
 
-  %res = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
   ret <2 x i64> %res
 }
 
@@ -1862,8 +1862,8 @@
   %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
 
-  %res = tail call <2 x i64> @llvm.arm64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
-  %sum = call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
+  %res = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+  %sum = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %accum, <2 x i64> %res)
   ret <2 x i64> %sum
 }
 
@@ -1875,7 +1875,7 @@
   %lhs.high = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
   %rhs.high = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 0>
 
-  %res = tail call <2 x i64> @llvm.arm64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
+  %res = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %lhs.high, <2 x i32> %rhs.high) nounwind
   %sum = add <2 x i64> %accum, %res
   ret <2 x i64> %sum
 }
@@ -1997,23 +1997,23 @@
 define i64 @sqdmlal_d(i32 %A, i32 %B, i64 %C) nounwind {
 ;CHECK-LABEL: sqdmlal_d:
 ;CHECK: sqdmlal
-  %tmp4 = call i64 @llvm.arm64.neon.sqdmulls.scalar(i32 %A, i32 %B)
-  %tmp5 = call i64 @llvm.arm64.neon.sqadd.i64(i64 %C, i64 %tmp4)
+  %tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
+  %tmp5 = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %C, i64 %tmp4)
   ret i64 %tmp5
 }
 
 define i64 @sqdmlsl_d(i32 %A, i32 %B, i64 %C) nounwind {
 ;CHECK-LABEL: sqdmlsl_d:
 ;CHECK: sqdmlsl
-  %tmp4 = call i64 @llvm.arm64.neon.sqdmulls.scalar(i32 %A, i32 %B)
-  %tmp5 = call i64 @llvm.arm64.neon.sqsub.i64(i64 %C, i64 %tmp4)
+  %tmp4 = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %A, i32 %B)
+  %tmp5 = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %C, i64 %tmp4)
   ret i64 %tmp5
 }
 
 define <16 x i8> @test_pmull_64(i64 %l, i64 %r) nounwind {
 ; CHECK-LABEL: test_pmull_64:
 ; CHECK: pmull.1q
-  %val = call <16 x i8> @llvm.arm64.neon.pmull64(i64 %l, i64 %r)
+  %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l, i64 %r)
   ret <16 x i8> %val
 }
 
@@ -2022,11 +2022,11 @@
 ; CHECK: pmull2.1q
   %l_hi = extractelement <2 x i64> %l, i32 1
   %r_hi = extractelement <2 x i64> %r, i32 1
-  %val = call <16 x i8> @llvm.arm64.neon.pmull64(i64 %l_hi, i64 %r_hi)
+  %val = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %l_hi, i64 %r_hi)
   ret <16 x i8> %val
 }
 
-declare <16 x i8> @llvm.arm64.neon.pmull64(i64, i64)
+declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)
 
 define <1 x i64> @test_mul_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) nounwind {
 ; CHECK-LABEL: test_mul_v1i64:
diff --git a/llvm/test/CodeGen/ARM64/volatile.ll b/llvm/test/CodeGen/AArch64/arm64-volatile.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/volatile.ll
rename to llvm/test/CodeGen/AArch64/arm64-volatile.ll
diff --git a/llvm/test/CodeGen/ARM64/vpopcnt.ll b/llvm/test/CodeGen/AArch64/arm64-vpopcnt.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/vpopcnt.ll
rename to llvm/test/CodeGen/AArch64/arm64-vpopcnt.ll
diff --git a/llvm/test/CodeGen/AArch64/arm64-vqadd.ll b/llvm/test/CodeGen/AArch64/arm64-vqadd.ll
new file mode 100644
index 0000000..20f7e2c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-vqadd.ll
@@ -0,0 +1,332 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @sqadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: sqadd8b:
+;CHECK: sqadd.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = load <8 x i8>* %B
+	%tmp3 = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+	ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sqadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sqadd4h:
+;CHECK: sqadd.4h
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = load <4 x i16>* %B
+	%tmp3 = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+	ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sqadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sqadd2s:
+;CHECK: sqadd.2s
+	%tmp1 = load <2 x i32>* %A
+	%tmp2 = load <2 x i32>* %B
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+	ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @uqadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uqadd8b:
+;CHECK: uqadd.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = load <8 x i8>* %B
+	%tmp3 = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+	ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @uqadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uqadd4h:
+;CHECK: uqadd.4h
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = load <4 x i16>* %B
+	%tmp3 = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+	ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @uqadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uqadd2s:
+;CHECK: uqadd.2s
+	%tmp1 = load <2 x i32>* %A
+	%tmp2 = load <2 x i32>* %B
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+	ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @sqadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: sqadd16b:
+;CHECK: sqadd.16b
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = load <16 x i8>* %B
+	%tmp3 = call <16 x i8> @llvm.aarch64.neon.sqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+	ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @sqadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sqadd8h:
+;CHECK: sqadd.8h
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = load <8 x i16>* %B
+	%tmp3 = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+	ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @sqadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sqadd4s:
+;CHECK: sqadd.4s
+	%tmp1 = load <4 x i32>* %A
+	%tmp2 = load <4 x i32>* %B
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+	ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @sqadd2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: sqadd2d:
+;CHECK: sqadd.2d
+	%tmp1 = load <2 x i64>* %A
+	%tmp2 = load <2 x i64>* %B
+	%tmp3 = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+	ret <2 x i64> %tmp3
+}
+
+define <16 x i8> @uqadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: uqadd16b:
+;CHECK: uqadd.16b
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = load <16 x i8>* %B
+	%tmp3 = call <16 x i8> @llvm.aarch64.neon.uqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+	ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @uqadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uqadd8h:
+;CHECK: uqadd.8h
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = load <8 x i16>* %B
+	%tmp3 = call <8 x i16> @llvm.aarch64.neon.uqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+	ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @uqadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uqadd4s:
+;CHECK: uqadd.4s
+	%tmp1 = load <4 x i32>* %A
+	%tmp2 = load <4 x i32>* %B
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.uqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+	ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @uqadd2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: uqadd2d:
+;CHECK: uqadd.2d
+	%tmp1 = load <2 x i64>* %A
+	%tmp2 = load <2 x i64>* %B
+	%tmp3 = call <2 x i64> @llvm.aarch64.neon.uqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+	ret <2 x i64> %tmp3
+}
+
+declare <8 x i8>  @llvm.aarch64.neon.sqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.sqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <8 x i8>  @llvm.aarch64.neon.uqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.uqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.sqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.uqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.uqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.uqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.uqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i8> @usqadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: usqadd8b:
+;CHECK: usqadd.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = load <8 x i8>* %B
+	%tmp3 = call <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+	ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @usqadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: usqadd4h:
+;CHECK: usqadd.4h
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = load <4 x i16>* %B
+	%tmp3 = call <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+	ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @usqadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: usqadd2s:
+;CHECK: usqadd.2s
+	%tmp1 = load <2 x i32>* %A
+	%tmp2 = load <2 x i32>* %B
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.usqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+	ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @usqadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: usqadd16b:
+;CHECK: usqadd.16b
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = load <16 x i8>* %B
+	%tmp3 = call <16 x i8> @llvm.aarch64.neon.usqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+	ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @usqadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: usqadd8h:
+;CHECK: usqadd.8h
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = load <8 x i16>* %B
+	%tmp3 = call <8 x i16> @llvm.aarch64.neon.usqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+	ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @usqadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: usqadd4s:
+;CHECK: usqadd.4s
+	%tmp1 = load <4 x i32>* %A
+	%tmp2 = load <4 x i32>* %B
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.usqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+	ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @usqadd2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: usqadd2d:
+;CHECK: usqadd.2d
+	%tmp1 = load <2 x i64>* %A
+	%tmp2 = load <2 x i64>* %B
+	%tmp3 = call <2 x i64> @llvm.aarch64.neon.usqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+	ret <2 x i64> %tmp3
+}
+
+define i64 @usqadd_d(i64 %l, i64 %r) nounwind {
+; CHECK-LABEL: usqadd_d:
+; CHECK: usqadd {{d[0-9]+}}, {{d[0-9]+}}
+  %sum = call i64 @llvm.aarch64.neon.usqadd.i64(i64 %l, i64 %r)
+  ret i64 %sum
+}
+
+define i32 @usqadd_s(i32 %l, i32 %r) nounwind {
+; CHECK-LABEL: usqadd_s:
+; CHECK: usqadd {{s[0-9]+}}, {{s[0-9]+}}
+  %sum = call i32 @llvm.aarch64.neon.usqadd.i32(i32 %l, i32 %r)
+  ret i32 %sum
+}
+
+declare <8 x i8>  @llvm.aarch64.neon.usqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.usqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.usqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+declare i64 @llvm.aarch64.neon.usqadd.i64(i64, i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.usqadd.i32(i32, i32) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.usqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.usqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.usqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.usqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i8> @suqadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: suqadd8b:
+;CHECK: suqadd.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = load <8 x i8>* %B
+	%tmp3 = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+	ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @suqadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: suqadd4h:
+;CHECK: suqadd.4h
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = load <4 x i16>* %B
+	%tmp3 = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+	ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @suqadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: suqadd2s:
+;CHECK: suqadd.2s
+	%tmp1 = load <2 x i32>* %A
+	%tmp2 = load <2 x i32>* %B
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+	ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @suqadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: suqadd16b:
+;CHECK: suqadd.16b
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = load <16 x i8>* %B
+	%tmp3 = call <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+	ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @suqadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: suqadd8h:
+;CHECK: suqadd.8h
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = load <8 x i16>* %B
+	%tmp3 = call <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+	ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @suqadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: suqadd4s:
+;CHECK: suqadd.4s
+	%tmp1 = load <4 x i32>* %A
+	%tmp2 = load <4 x i32>* %B
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+	ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @suqadd2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: suqadd2d:
+;CHECK: suqadd.2d
+	%tmp1 = load <2 x i64>* %A
+	%tmp2 = load <2 x i64>* %B
+	%tmp3 = call <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+	ret <2 x i64> %tmp3
+}
+
+define <1 x i64> @suqadd_1d(<1 x i64> %l, <1 x i64> %r) nounwind {
+; CHECK-LABEL: suqadd_1d:
+; CHECK: suqadd {{d[0-9]+}}, {{d[0-9]+}}
+  %sum = call <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64> %l, <1 x i64> %r)
+  ret <1 x i64> %sum
+}
+
+define i64 @suqadd_d(i64 %l, i64 %r) nounwind {
+; CHECK-LABEL: suqadd_d:
+; CHECK: suqadd {{d[0-9]+}}, {{d[0-9]+}}
+  %sum = call i64 @llvm.aarch64.neon.suqadd.i64(i64 %l, i64 %r)
+  ret i64 %sum
+}
+
+define i32 @suqadd_s(i32 %l, i32 %r) nounwind {
+; CHECK-LABEL: suqadd_s:
+; CHECK: suqadd {{s[0-9]+}}, {{s[0-9]+}}
+  %sum = call i32 @llvm.aarch64.neon.suqadd.i32(i32 %l, i32 %r)
+  ret i32 %sum
+}
+
+declare <8 x i8>  @llvm.aarch64.neon.suqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+declare i64 @llvm.aarch64.neon.suqadd.i64(i64, i64) nounwind readnone
+declare i32 @llvm.aarch64.neon.suqadd.i32(i32, i32) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/llvm/test/CodeGen/AArch64/arm64-vqsub.ll b/llvm/test/CodeGen/AArch64/arm64-vqsub.ll
new file mode 100644
index 0000000..dde3ac3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-vqsub.ll
@@ -0,0 +1,147 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <8 x i8> @sqsub8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: sqsub8b:
+;CHECK: sqsub.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = load <8 x i8>* %B
+	%tmp3 = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+	ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @sqsub4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: sqsub4h:
+;CHECK: sqsub.4h
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = load <4 x i16>* %B
+	%tmp3 = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+	ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @sqsub2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: sqsub2s:
+;CHECK: sqsub.2s
+	%tmp1 = load <2 x i32>* %A
+	%tmp2 = load <2 x i32>* %B
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+	ret <2 x i32> %tmp3
+}
+
+define <8 x i8> @uqsub8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK-LABEL: uqsub8b:
+;CHECK: uqsub.8b
+	%tmp1 = load <8 x i8>* %A
+	%tmp2 = load <8 x i8>* %B
+	%tmp3 = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+	ret <8 x i8> %tmp3
+}
+
+define <4 x i16> @uqsub4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK-LABEL: uqsub4h:
+;CHECK: uqsub.4h
+	%tmp1 = load <4 x i16>* %A
+	%tmp2 = load <4 x i16>* %B
+	%tmp3 = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+	ret <4 x i16> %tmp3
+}
+
+define <2 x i32> @uqsub2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK-LABEL: uqsub2s:
+;CHECK: uqsub.2s
+	%tmp1 = load <2 x i32>* %A
+	%tmp2 = load <2 x i32>* %B
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.uqsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+	ret <2 x i32> %tmp3
+}
+
+define <16 x i8> @sqsub16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: sqsub16b:
+;CHECK: sqsub.16b
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = load <16 x i8>* %B
+	%tmp3 = call <16 x i8> @llvm.aarch64.neon.sqsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+	ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @sqsub8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: sqsub8h:
+;CHECK: sqsub.8h
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = load <8 x i16>* %B
+	%tmp3 = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+	ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @sqsub4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: sqsub4s:
+;CHECK: sqsub.4s
+	%tmp1 = load <4 x i32>* %A
+	%tmp2 = load <4 x i32>* %B
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+	ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @sqsub2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: sqsub2d:
+;CHECK: sqsub.2d
+	%tmp1 = load <2 x i64>* %A
+	%tmp2 = load <2 x i64>* %B
+	%tmp3 = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+	ret <2 x i64> %tmp3
+}
+
+define <16 x i8> @uqsub16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
+;CHECK-LABEL: uqsub16b:
+;CHECK: uqsub.16b
+	%tmp1 = load <16 x i8>* %A
+	%tmp2 = load <16 x i8>* %B
+	%tmp3 = call <16 x i8> @llvm.aarch64.neon.uqsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+	ret <16 x i8> %tmp3
+}
+
+define <8 x i16> @uqsub8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
+;CHECK-LABEL: uqsub8h:
+;CHECK: uqsub.8h
+	%tmp1 = load <8 x i16>* %A
+	%tmp2 = load <8 x i16>* %B
+	%tmp3 = call <8 x i16> @llvm.aarch64.neon.uqsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+	ret <8 x i16> %tmp3
+}
+
+define <4 x i32> @uqsub4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
+;CHECK-LABEL: uqsub4s:
+;CHECK: uqsub.4s
+	%tmp1 = load <4 x i32>* %A
+	%tmp2 = load <4 x i32>* %B
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.uqsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+	ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @uqsub2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
+;CHECK-LABEL: uqsub2d:
+;CHECK: uqsub.2d
+	%tmp1 = load <2 x i64>* %A
+	%tmp2 = load <2 x i64>* %B
+	%tmp3 = call <2 x i64> @llvm.aarch64.neon.uqsub.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+	ret <2 x i64> %tmp3
+}
+
+declare <8 x i8>  @llvm.aarch64.neon.sqsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.sqsub.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <8 x i8>  @llvm.aarch64.neon.uqsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uqsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.uqsub.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.sqsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+
+declare <16 x i8> @llvm.aarch64.neon.uqsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.uqsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.uqsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.uqsub.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM64/vselect.ll b/llvm/test/CodeGen/AArch64/arm64-vselect.ll
similarity index 88%
rename from llvm/test/CodeGen/ARM64/vselect.ll
rename to llvm/test/CodeGen/AArch64/arm64-vselect.ll
index aa8e81e..9988512 100644
--- a/llvm/test/CodeGen/ARM64/vselect.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vselect.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 ;CHECK: @func63
 ;CHECK: cmeq.4h v0, v0, v1
diff --git a/llvm/test/CodeGen/ARM64/vsetcc_fp.ll b/llvm/test/CodeGen/AArch64/arm64-vsetcc_fp.ll
similarity index 80%
rename from llvm/test/CodeGen/ARM64/vsetcc_fp.ll
rename to llvm/test/CodeGen/AArch64/arm64-vsetcc_fp.ll
index c93aad5..f4f4714 100644
--- a/llvm/test/CodeGen/ARM64/vsetcc_fp.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vsetcc_fp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
 define <2 x i32> @fcmp_one(<2 x float> %x, <2 x float> %y) nounwind optsize readnone {
 ; CHECK-LABEL: fcmp_one:
 ; CHECK-NEXT: fcmgt.2s [[REG:v[0-9]+]], v0, v1
diff --git a/llvm/test/CodeGen/ARM64/vshift.ll b/llvm/test/CodeGen/AArch64/arm64-vshift.ll
similarity index 68%
rename from llvm/test/CodeGen/ARM64/vshift.ll
rename to llvm/test/CodeGen/AArch64/arm64-vshift.ll
index 486c6cc..82ae486 100644
--- a/llvm/test/CodeGen/ARM64/vshift.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vshift.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -enable-misched=false | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple -enable-misched=false | FileCheck %s
 
 define <8 x i8> @sqshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ;CHECK-LABEL: sqshl8b:
 ;CHECK: sqshl.8b
         %tmp1 = load <8 x i8>* %A
         %tmp2 = load <8 x i8>* %B
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.sqshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
         ret <8 x i8> %tmp3
 }
 
@@ -14,7 +14,7 @@
 ;CHECK: sqshl.4h
         %tmp1 = load <4 x i16>* %A
         %tmp2 = load <4 x i16>* %B
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.sqshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
         ret <4 x i16> %tmp3
 }
 
@@ -23,7 +23,7 @@
 ;CHECK: sqshl.2s
         %tmp1 = load <2 x i32>* %A
         %tmp2 = load <2 x i32>* %B
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.sqshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
         ret <2 x i32> %tmp3
 }
 
@@ -32,7 +32,7 @@
 ;CHECK: uqshl.8b
         %tmp1 = load <8 x i8>* %A
         %tmp2 = load <8 x i8>* %B
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.uqshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
         ret <8 x i8> %tmp3
 }
 
@@ -41,7 +41,7 @@
 ;CHECK: uqshl.4h
         %tmp1 = load <4 x i16>* %A
         %tmp2 = load <4 x i16>* %B
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.uqshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
         ret <4 x i16> %tmp3
 }
 
@@ -50,7 +50,7 @@
 ;CHECK: uqshl.2s
         %tmp1 = load <2 x i32>* %A
         %tmp2 = load <2 x i32>* %B
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.uqshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
         ret <2 x i32> %tmp3
 }
 
@@ -59,7 +59,7 @@
 ;CHECK: sqshl.16b
         %tmp1 = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
-        %tmp3 = call <16 x i8> @llvm.arm64.neon.sqshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+        %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
         ret <16 x i8> %tmp3
 }
 
@@ -68,7 +68,7 @@
 ;CHECK: sqshl.8h
         %tmp1 = load <8 x i16>* %A
         %tmp2 = load <8 x i16>* %B
-        %tmp3 = call <8 x i16> @llvm.arm64.neon.sqshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+        %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
         ret <8 x i16> %tmp3
 }
 
@@ -77,7 +77,7 @@
 ;CHECK: sqshl.4s
         %tmp1 = load <4 x i32>* %A
         %tmp2 = load <4 x i32>* %B
-        %tmp3 = call <4 x i32> @llvm.arm64.neon.sqshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+        %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
         ret <4 x i32> %tmp3
 }
 
@@ -86,7 +86,7 @@
 ;CHECK: sqshl.2d
         %tmp1 = load <2 x i64>* %A
         %tmp2 = load <2 x i64>* %B
-        %tmp3 = call <2 x i64> @llvm.arm64.neon.sqshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+        %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
         ret <2 x i64> %tmp3
 }
 
@@ -95,7 +95,7 @@
 ;CHECK: uqshl.16b
         %tmp1 = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
-        %tmp3 = call <16 x i8> @llvm.arm64.neon.uqshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+        %tmp3 = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
         ret <16 x i8> %tmp3
 }
 
@@ -104,7 +104,7 @@
 ;CHECK: uqshl.8h
         %tmp1 = load <8 x i16>* %A
         %tmp2 = load <8 x i16>* %B
-        %tmp3 = call <8 x i16> @llvm.arm64.neon.uqshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+        %tmp3 = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
         ret <8 x i16> %tmp3
 }
 
@@ -113,7 +113,7 @@
 ;CHECK: uqshl.4s
         %tmp1 = load <4 x i32>* %A
         %tmp2 = load <4 x i32>* %B
-        %tmp3 = call <4 x i32> @llvm.arm64.neon.uqshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+        %tmp3 = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
         ret <4 x i32> %tmp3
 }
 
@@ -122,36 +122,36 @@
 ;CHECK: uqshl.2d
         %tmp1 = load <2 x i64>* %A
         %tmp2 = load <2 x i64>* %B
-        %tmp3 = call <2 x i64> @llvm.arm64.neon.uqshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+        %tmp3 = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
         ret <2 x i64> %tmp3
 }
 
-declare <8 x i8>  @llvm.arm64.neon.sqshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sqshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.sqshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+declare <8 x i8>  @llvm.aarch64.neon.sqshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
 
-declare <8 x i8>  @llvm.arm64.neon.uqshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.uqshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.uqshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.uqshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+declare <8 x i8>  @llvm.aarch64.neon.uqshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
 
-declare <16 x i8> @llvm.arm64.neon.sqshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.sqshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.sqshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.sqshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
 
-declare <16 x i8> @llvm.arm64.neon.uqshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.uqshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.uqshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.uqshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
 
 define <8 x i8> @srshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ;CHECK-LABEL: srshl8b:
 ;CHECK: srshl.8b
         %tmp1 = load <8 x i8>* %A
         %tmp2 = load <8 x i8>* %B
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
         ret <8 x i8> %tmp3
 }
 
@@ -160,7 +160,7 @@
 ;CHECK: srshl.4h
         %tmp1 = load <4 x i16>* %A
         %tmp2 = load <4 x i16>* %B
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
         ret <4 x i16> %tmp3
 }
 
@@ -169,7 +169,7 @@
 ;CHECK: srshl.2s
         %tmp1 = load <2 x i32>* %A
         %tmp2 = load <2 x i32>* %B
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
         ret <2 x i32> %tmp3
 }
 
@@ -178,7 +178,7 @@
 ;CHECK: urshl.8b
         %tmp1 = load <8 x i8>* %A
         %tmp2 = load <8 x i8>* %B
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
         ret <8 x i8> %tmp3
 }
 
@@ -187,7 +187,7 @@
 ;CHECK: urshl.4h
         %tmp1 = load <4 x i16>* %A
         %tmp2 = load <4 x i16>* %B
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
         ret <4 x i16> %tmp3
 }
 
@@ -196,7 +196,7 @@
 ;CHECK: urshl.2s
         %tmp1 = load <2 x i32>* %A
         %tmp2 = load <2 x i32>* %B
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
         ret <2 x i32> %tmp3
 }
 
@@ -205,7 +205,7 @@
 ;CHECK: srshl.16b
         %tmp1 = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
-        %tmp3 = call <16 x i8> @llvm.arm64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+        %tmp3 = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
         ret <16 x i8> %tmp3
 }
 
@@ -214,7 +214,7 @@
 ;CHECK: srshl.8h
         %tmp1 = load <8 x i16>* %A
         %tmp2 = load <8 x i16>* %B
-        %tmp3 = call <8 x i16> @llvm.arm64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+        %tmp3 = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
         ret <8 x i16> %tmp3
 }
 
@@ -223,7 +223,7 @@
 ;CHECK: srshl.4s
         %tmp1 = load <4 x i32>* %A
         %tmp2 = load <4 x i32>* %B
-        %tmp3 = call <4 x i32> @llvm.arm64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+        %tmp3 = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
         ret <4 x i32> %tmp3
 }
 
@@ -232,7 +232,7 @@
 ;CHECK: srshl.2d
         %tmp1 = load <2 x i64>* %A
         %tmp2 = load <2 x i64>* %B
-        %tmp3 = call <2 x i64> @llvm.arm64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+        %tmp3 = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
         ret <2 x i64> %tmp3
 }
 
@@ -241,7 +241,7 @@
 ;CHECK: urshl.16b
         %tmp1 = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
-        %tmp3 = call <16 x i8> @llvm.arm64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+        %tmp3 = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
         ret <16 x i8> %tmp3
 }
 
@@ -250,7 +250,7 @@
 ;CHECK: urshl.8h
         %tmp1 = load <8 x i16>* %A
         %tmp2 = load <8 x i16>* %B
-        %tmp3 = call <8 x i16> @llvm.arm64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+        %tmp3 = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
         ret <8 x i16> %tmp3
 }
 
@@ -259,7 +259,7 @@
 ;CHECK: urshl.4s
         %tmp1 = load <4 x i32>* %A
         %tmp2 = load <4 x i32>* %B
-        %tmp3 = call <4 x i32> @llvm.arm64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+        %tmp3 = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
         ret <4 x i32> %tmp3
 }
 
@@ -268,36 +268,36 @@
 ;CHECK: urshl.2d
         %tmp1 = load <2 x i64>* %A
         %tmp2 = load <2 x i64>* %B
-        %tmp3 = call <2 x i64> @llvm.arm64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+        %tmp3 = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
         ret <2 x i64> %tmp3
 }
 
-declare <8 x i8>  @llvm.arm64.neon.srshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.srshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.srshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.srshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+declare <8 x i8>  @llvm.aarch64.neon.srshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
 
-declare <8 x i8>  @llvm.arm64.neon.urshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.urshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.urshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.urshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+declare <8 x i8>  @llvm.aarch64.neon.urshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
 
-declare <16 x i8> @llvm.arm64.neon.srshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.srshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.srshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.srshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
 
-declare <16 x i8> @llvm.arm64.neon.urshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.urshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.urshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.urshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
 
 define <8 x i8> @sqrshl8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ;CHECK-LABEL: sqrshl8b:
 ;CHECK: sqrshl.8b
         %tmp1 = load <8 x i8>* %A
         %tmp2 = load <8 x i8>* %B
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.sqrshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
         ret <8 x i8> %tmp3
 }
 
@@ -306,7 +306,7 @@
 ;CHECK: sqrshl.4h
         %tmp1 = load <4 x i16>* %A
         %tmp2 = load <4 x i16>* %B
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.sqrshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
         ret <4 x i16> %tmp3
 }
 
@@ -315,7 +315,7 @@
 ;CHECK: sqrshl.2s
         %tmp1 = load <2 x i32>* %A
         %tmp2 = load <2 x i32>* %B
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.sqrshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
         ret <2 x i32> %tmp3
 }
 
@@ -324,7 +324,7 @@
 ;CHECK: uqrshl.8b
         %tmp1 = load <8 x i8>* %A
         %tmp2 = load <8 x i8>* %B
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.uqrshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
         ret <8 x i8> %tmp3
 }
 
@@ -333,7 +333,7 @@
 ;CHECK: uqrshl.4h
         %tmp1 = load <4 x i16>* %A
         %tmp2 = load <4 x i16>* %B
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.uqrshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
         ret <4 x i16> %tmp3
 }
 
@@ -342,7 +342,7 @@
 ;CHECK: uqrshl.2s
         %tmp1 = load <2 x i32>* %A
         %tmp2 = load <2 x i32>* %B
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.uqrshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
         ret <2 x i32> %tmp3
 }
 
@@ -351,7 +351,7 @@
 ;CHECK: sqrshl.16b
         %tmp1 = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
-        %tmp3 = call <16 x i8> @llvm.arm64.neon.sqrshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+        %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqrshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
         ret <16 x i8> %tmp3
 }
 
@@ -360,7 +360,7 @@
 ;CHECK: sqrshl.8h
         %tmp1 = load <8 x i16>* %A
         %tmp2 = load <8 x i16>* %B
-        %tmp3 = call <8 x i16> @llvm.arm64.neon.sqrshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+        %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
         ret <8 x i16> %tmp3
 }
 
@@ -369,7 +369,7 @@
 ;CHECK: sqrshl.4s
         %tmp1 = load <4 x i32>* %A
         %tmp2 = load <4 x i32>* %B
-        %tmp3 = call <4 x i32> @llvm.arm64.neon.sqrshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+        %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
         ret <4 x i32> %tmp3
 }
 
@@ -378,7 +378,7 @@
 ;CHECK: sqrshl.2d
         %tmp1 = load <2 x i64>* %A
         %tmp2 = load <2 x i64>* %B
-        %tmp3 = call <2 x i64> @llvm.arm64.neon.sqrshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+        %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
         ret <2 x i64> %tmp3
 }
 
@@ -387,7 +387,7 @@
 ;CHECK: uqrshl.16b
         %tmp1 = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
-        %tmp3 = call <16 x i8> @llvm.arm64.neon.uqrshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
+        %tmp3 = call <16 x i8> @llvm.aarch64.neon.uqrshl.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
         ret <16 x i8> %tmp3
 }
 
@@ -396,7 +396,7 @@
 ;CHECK: uqrshl.8h
         %tmp1 = load <8 x i16>* %A
         %tmp2 = load <8 x i16>* %B
-        %tmp3 = call <8 x i16> @llvm.arm64.neon.uqrshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
+        %tmp3 = call <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
         ret <8 x i16> %tmp3
 }
 
@@ -405,7 +405,7 @@
 ;CHECK: uqrshl.4s
         %tmp1 = load <4 x i32>* %A
         %tmp2 = load <4 x i32>* %B
-        %tmp3 = call <4 x i32> @llvm.arm64.neon.uqrshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
+        %tmp3 = call <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
         ret <4 x i32> %tmp3
 }
 
@@ -414,35 +414,35 @@
 ;CHECK: uqrshl.2d
         %tmp1 = load <2 x i64>* %A
         %tmp2 = load <2 x i64>* %B
-        %tmp3 = call <2 x i64> @llvm.arm64.neon.uqrshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
+        %tmp3 = call <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
         ret <2 x i64> %tmp3
 }
 
-declare <8 x i8>  @llvm.arm64.neon.sqrshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sqrshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqrshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.sqrshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+declare <8 x i8>  @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.sqrshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
 
-declare <8 x i8>  @llvm.arm64.neon.uqrshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.uqrshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.uqrshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.uqrshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+declare <8 x i8>  @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.uqrshl.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
 
-declare <16 x i8> @llvm.arm64.neon.sqrshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.sqrshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.sqrshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.sqrshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.sqrshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
 
-declare <16 x i8> @llvm.arm64.neon.uqrshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.uqrshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.uqrshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.uqrshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.uqrshl.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
 
 define <8 x i8> @urshr8b(<8 x i8>* %A) nounwind {
 ;CHECK-LABEL: urshr8b:
 ;CHECK: urshr.8b
         %tmp1 = load <8 x i8>* %A
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
         ret <8 x i8> %tmp3
 }
 
@@ -450,7 +450,7 @@
 ;CHECK-LABEL: urshr4h:
 ;CHECK: urshr.4h
         %tmp1 = load <4 x i16>* %A
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
         ret <4 x i16> %tmp3
 }
 
@@ -458,7 +458,7 @@
 ;CHECK-LABEL: urshr2s:
 ;CHECK: urshr.2s
         %tmp1 = load <2 x i32>* %A
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
         ret <2 x i32> %tmp3
 }
 
@@ -466,7 +466,7 @@
 ;CHECK-LABEL: urshr16b:
 ;CHECK: urshr.16b
         %tmp1 = load <16 x i8>* %A
-        %tmp3 = call <16 x i8> @llvm.arm64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+        %tmp3 = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
         ret <16 x i8> %tmp3
 }
 
@@ -474,7 +474,7 @@
 ;CHECK-LABEL: urshr8h:
 ;CHECK: urshr.8h
         %tmp1 = load <8 x i16>* %A
-        %tmp3 = call <8 x i16> @llvm.arm64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+        %tmp3 = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
         ret <8 x i16> %tmp3
 }
 
@@ -482,7 +482,7 @@
 ;CHECK-LABEL: urshr4s:
 ;CHECK: urshr.4s
         %tmp1 = load <4 x i32>* %A
-        %tmp3 = call <4 x i32> @llvm.arm64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+        %tmp3 = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
         ret <4 x i32> %tmp3
 }
 
@@ -490,7 +490,7 @@
 ;CHECK-LABEL: urshr2d:
 ;CHECK: urshr.2d
         %tmp1 = load <2 x i64>* %A
-        %tmp3 = call <2 x i64> @llvm.arm64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
+        %tmp3 = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
         ret <2 x i64> %tmp3
 }
 
@@ -498,7 +498,7 @@
 ;CHECK-LABEL: srshr8b:
 ;CHECK: srshr.8b
         %tmp1 = load <8 x i8>* %A
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
         ret <8 x i8> %tmp3
 }
 
@@ -506,7 +506,7 @@
 ;CHECK-LABEL: srshr4h:
 ;CHECK: srshr.4h
         %tmp1 = load <4 x i16>* %A
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
         ret <4 x i16> %tmp3
 }
 
@@ -514,7 +514,7 @@
 ;CHECK-LABEL: srshr2s:
 ;CHECK: srshr.2s
         %tmp1 = load <2 x i32>* %A
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
         ret <2 x i32> %tmp3
 }
 
@@ -522,7 +522,7 @@
 ;CHECK-LABEL: srshr16b:
 ;CHECK: srshr.16b
         %tmp1 = load <16 x i8>* %A
-        %tmp3 = call <16 x i8> @llvm.arm64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+        %tmp3 = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
         ret <16 x i8> %tmp3
 }
 
@@ -530,7 +530,7 @@
 ;CHECK-LABEL: srshr8h:
 ;CHECK: srshr.8h
         %tmp1 = load <8 x i16>* %A
-        %tmp3 = call <8 x i16> @llvm.arm64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+        %tmp3 = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
         ret <8 x i16> %tmp3
 }
 
@@ -538,7 +538,7 @@
 ;CHECK-LABEL: srshr4s:
 ;CHECK: srshr.4s
         %tmp1 = load <4 x i32>* %A
-        %tmp3 = call <4 x i32> @llvm.arm64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+        %tmp3 = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
         ret <4 x i32> %tmp3
 }
 
@@ -546,7 +546,7 @@
 ;CHECK-LABEL: srshr2d:
 ;CHECK: srshr.2d
         %tmp1 = load <2 x i64>* %A
-        %tmp3 = call <2 x i64> @llvm.arm64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
+        %tmp3 = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
         ret <2 x i64> %tmp3
 }
 
@@ -554,7 +554,7 @@
 ;CHECK-LABEL: sqshlu8b:
 ;CHECK: sqshlu.8b v0, {{v[0-9]+}}, #1
         %tmp1 = load <8 x i8>* %A
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.sqshlu.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
         ret <8 x i8> %tmp3
 }
 
@@ -562,7 +562,7 @@
 ;CHECK-LABEL: sqshlu4h:
 ;CHECK: sqshlu.4h v0, {{v[0-9]+}}, #1
         %tmp1 = load <4 x i16>* %A
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.sqshlu.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
         ret <4 x i16> %tmp3
 }
 
@@ -570,7 +570,7 @@
 ;CHECK-LABEL: sqshlu2s:
 ;CHECK: sqshlu.2s v0, {{v[0-9]+}}, #1
         %tmp1 = load <2 x i32>* %A
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.sqshlu.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 1, i32 1>)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 1, i32 1>)
         ret <2 x i32> %tmp3
 }
 
@@ -578,7 +578,7 @@
 ;CHECK-LABEL: sqshlu16b:
 ;CHECK: sqshlu.16b v0, {{v[0-9]+}}, #1
         %tmp1 = load <16 x i8>* %A
-        %tmp3 = call <16 x i8> @llvm.arm64.neon.sqshlu.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+        %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
         ret <16 x i8> %tmp3
 }
 
@@ -586,7 +586,7 @@
 ;CHECK-LABEL: sqshlu8h:
 ;CHECK: sqshlu.8h v0, {{v[0-9]+}}, #1
         %tmp1 = load <8 x i16>* %A
-        %tmp3 = call <8 x i16> @llvm.arm64.neon.sqshlu.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+        %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
         ret <8 x i16> %tmp3
 }
 
@@ -594,7 +594,7 @@
 ;CHECK-LABEL: sqshlu4s:
 ;CHECK: sqshlu.4s v0, {{v[0-9]+}}, #1
         %tmp1 = load <4 x i32>* %A
-        %tmp3 = call <4 x i32> @llvm.arm64.neon.sqshlu.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+        %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
         ret <4 x i32> %tmp3
 }
 
@@ -602,25 +602,25 @@
 ;CHECK-LABEL: sqshlu2d:
 ;CHECK: sqshlu.2d v0, {{v[0-9]+}}, #1
         %tmp1 = load <2 x i64>* %A
-        %tmp3 = call <2 x i64> @llvm.arm64.neon.sqshlu.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 1, i64 1>)
+        %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 1, i64 1>)
         ret <2 x i64> %tmp3
 }
 
-declare <8 x i8>  @llvm.arm64.neon.sqshlu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sqshlu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqshlu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.sqshlu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
+declare <8 x i8>  @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.sqshlu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
 
-declare <16 x i8> @llvm.arm64.neon.sqshlu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.sqshlu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.sqshlu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.sqshlu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
 
 define <8 x i8> @rshrn8b(<8 x i16>* %A) nounwind {
 ;CHECK-LABEL: rshrn8b:
 ;CHECK: rshrn.8b v0, {{v[0-9]+}}, #1
         %tmp1 = load <8 x i16>* %A
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16> %tmp1, i32 1)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %tmp1, i32 1)
         ret <8 x i8> %tmp3
 }
 
@@ -628,7 +628,7 @@
 ;CHECK-LABEL: rshrn4h:
 ;CHECK: rshrn.4h v0, {{v[0-9]+}}, #1
         %tmp1 = load <4 x i32>* %A
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.rshrn.v4i16(<4 x i32> %tmp1, i32 1)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %tmp1, i32 1)
         ret <4 x i16> %tmp3
 }
 
@@ -636,7 +636,7 @@
 ;CHECK-LABEL: rshrn2s:
 ;CHECK: rshrn.2s v0, {{v[0-9]+}}, #1
         %tmp1 = load <2 x i64>* %A
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.rshrn.v2i32(<2 x i64> %tmp1, i32 1)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> %tmp1, i32 1)
         ret <2 x i32> %tmp3
 }
 
@@ -645,7 +645,7 @@
 ;CHECK: rshrn2.16b v0, {{v[0-9]+}}, #1
         %out = load <8 x i8>* %ret
         %tmp1 = load <8 x i16>* %A
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16> %tmp1, i32 1)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %tmp1, i32 1)
         %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
         ret <16 x i8> %tmp4
 }
@@ -655,7 +655,7 @@
 ;CHECK: rshrn2.8h v0, {{v[0-9]+}}, #1
         %out = load <4 x i16>* %ret
         %tmp1 = load <4 x i32>* %A
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.rshrn.v4i16(<4 x i32> %tmp1, i32 1)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> %tmp1, i32 1)
         %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
         ret <8 x i16> %tmp4
 }
@@ -665,14 +665,14 @@
 ;CHECK: rshrn2.4s v0, {{v[0-9]+}}, #1
         %out = load <2 x i32>* %ret
         %tmp1 = load <2 x i64>* %A
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.rshrn.v2i32(<2 x i64> %tmp1, i32 1)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> %tmp1, i32 1)
         %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
         ret <4 x i32> %tmp4
 }
 
-declare <8 x i8>  @llvm.arm64.neon.rshrn.v8i8(<8 x i16>, i32) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.rshrn.v4i16(<4 x i32>, i32) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.rshrn.v2i32(<2 x i64>, i32) nounwind readnone
+declare <8 x i8>  @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64>, i32) nounwind readnone
 
 define <8 x i8> @shrn8b(<8 x i16>* %A) nounwind {
 ;CHECK-LABEL: shrn8b:
@@ -734,14 +734,14 @@
         ret <4 x i32> %tmp4
 }
 
-declare <8 x i8>  @llvm.arm64.neon.shrn.v8i8(<8 x i16>, i32) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.shrn.v4i16(<4 x i32>, i32) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.shrn.v2i32(<2 x i64>, i32) nounwind readnone
+declare <8 x i8>  @llvm.aarch64.neon.shrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.shrn.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.shrn.v2i32(<2 x i64>, i32) nounwind readnone
 
 define i32 @sqshrn1s(i64 %A) nounwind {
 ; CHECK-LABEL: sqshrn1s:
 ; CHECK: sqshrn {{s[0-9]+}}, d0, #1
-  %tmp = call i32 @llvm.arm64.neon.sqshrn.i32(i64 %A, i32 1)
+  %tmp = call i32 @llvm.aarch64.neon.sqshrn.i32(i64 %A, i32 1)
   ret i32 %tmp
 }
 
@@ -749,7 +749,7 @@
 ;CHECK-LABEL: sqshrn8b:
 ;CHECK: sqshrn.8b v0, {{v[0-9]+}}, #1
         %tmp1 = load <8 x i16>* %A
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.sqshrn.v8i8(<8 x i16> %tmp1, i32 1)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> %tmp1, i32 1)
         ret <8 x i8> %tmp3
 }
 
@@ -757,7 +757,7 @@
 ;CHECK-LABEL: sqshrn4h:
 ;CHECK: sqshrn.4h v0, {{v[0-9]+}}, #1
         %tmp1 = load <4 x i32>* %A
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.sqshrn.v4i16(<4 x i32> %tmp1, i32 1)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> %tmp1, i32 1)
         ret <4 x i16> %tmp3
 }
 
@@ -765,7 +765,7 @@
 ;CHECK-LABEL: sqshrn2s:
 ;CHECK: sqshrn.2s v0, {{v[0-9]+}}, #1
         %tmp1 = load <2 x i64>* %A
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.sqshrn.v2i32(<2 x i64> %tmp1, i32 1)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> %tmp1, i32 1)
         ret <2 x i32> %tmp3
 }
 
@@ -775,7 +775,7 @@
 ;CHECK: sqshrn2.16b v0, {{v[0-9]+}}, #1
         %out = load <8 x i8>* %ret
         %tmp1 = load <8 x i16>* %A
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.sqshrn.v8i8(<8 x i16> %tmp1, i32 1)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> %tmp1, i32 1)
         %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
         ret <16 x i8> %tmp4
 }
@@ -785,7 +785,7 @@
 ;CHECK: sqshrn2.8h v0, {{v[0-9]+}}, #1
         %out = load <4 x i16>* %ret
         %tmp1 = load <4 x i32>* %A
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.sqshrn.v4i16(<4 x i32> %tmp1, i32 1)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> %tmp1, i32 1)
         %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
         ret <8 x i16> %tmp4
 }
@@ -795,20 +795,20 @@
 ;CHECK: sqshrn2.4s v0, {{v[0-9]+}}, #1
         %out = load <2 x i32>* %ret
         %tmp1 = load <2 x i64>* %A
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.sqshrn.v2i32(<2 x i64> %tmp1, i32 1)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> %tmp1, i32 1)
         %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
         ret <4 x i32> %tmp4
 }
 
-declare i32  @llvm.arm64.neon.sqshrn.i32(i64, i32) nounwind readnone
-declare <8 x i8>  @llvm.arm64.neon.sqshrn.v8i8(<8 x i16>, i32) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sqshrn.v4i16(<4 x i32>, i32) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqshrn.v2i32(<2 x i64>, i32) nounwind readnone
+declare i32  @llvm.aarch64.neon.sqshrn.i32(i64, i32) nounwind readnone
+declare <8 x i8>  @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64>, i32) nounwind readnone
 
 define i32 @sqshrun1s(i64 %A) nounwind {
 ; CHECK-LABEL: sqshrun1s:
 ; CHECK: sqshrun {{s[0-9]+}}, d0, #1
-  %tmp = call i32 @llvm.arm64.neon.sqshrun.i32(i64 %A, i32 1)
+  %tmp = call i32 @llvm.aarch64.neon.sqshrun.i32(i64 %A, i32 1)
   ret i32 %tmp
 }
 
@@ -816,7 +816,7 @@
 ;CHECK-LABEL: sqshrun8b:
 ;CHECK: sqshrun.8b v0, {{v[0-9]+}}, #1
         %tmp1 = load <8 x i16>* %A
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.sqshrun.v8i8(<8 x i16> %tmp1, i32 1)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> %tmp1, i32 1)
         ret <8 x i8> %tmp3
 }
 
@@ -824,7 +824,7 @@
 ;CHECK-LABEL: sqshrun4h:
 ;CHECK: sqshrun.4h v0, {{v[0-9]+}}, #1
         %tmp1 = load <4 x i32>* %A
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.sqshrun.v4i16(<4 x i32> %tmp1, i32 1)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> %tmp1, i32 1)
         ret <4 x i16> %tmp3
 }
 
@@ -832,7 +832,7 @@
 ;CHECK-LABEL: sqshrun2s:
 ;CHECK: sqshrun.2s v0, {{v[0-9]+}}, #1
         %tmp1 = load <2 x i64>* %A
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.sqshrun.v2i32(<2 x i64> %tmp1, i32 1)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> %tmp1, i32 1)
         ret <2 x i32> %tmp3
 }
 
@@ -841,7 +841,7 @@
 ;CHECK: sqshrun2.16b v0, {{v[0-9]+}}, #1
         %out = load <8 x i8>* %ret
         %tmp1 = load <8 x i16>* %A
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.sqshrun.v8i8(<8 x i16> %tmp1, i32 1)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> %tmp1, i32 1)
         %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
         ret <16 x i8> %tmp4
 }
@@ -851,7 +851,7 @@
 ;CHECK: sqshrun2.8h v0, {{v[0-9]+}}, #1
         %out = load <4 x i16>* %ret
         %tmp1 = load <4 x i32>* %A
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.sqshrun.v4i16(<4 x i32> %tmp1, i32 1)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> %tmp1, i32 1)
         %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
         ret <8 x i16> %tmp4
 }
@@ -861,20 +861,20 @@
 ;CHECK: sqshrun2.4s v0, {{v[0-9]+}}, #1
         %out = load <2 x i32>* %ret
         %tmp1 = load <2 x i64>* %A
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.sqshrun.v2i32(<2 x i64> %tmp1, i32 1)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> %tmp1, i32 1)
         %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
         ret <4 x i32> %tmp4
 }
 
-declare i32  @llvm.arm64.neon.sqshrun.i32(i64, i32) nounwind readnone
-declare <8 x i8>  @llvm.arm64.neon.sqshrun.v8i8(<8 x i16>, i32) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sqshrun.v4i16(<4 x i32>, i32) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqshrun.v2i32(<2 x i64>, i32) nounwind readnone
+declare i32  @llvm.aarch64.neon.sqshrun.i32(i64, i32) nounwind readnone
+declare <8 x i8>  @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64>, i32) nounwind readnone
 
 define i32 @sqrshrn1s(i64 %A) nounwind {
 ; CHECK-LABEL: sqrshrn1s:
 ; CHECK: sqrshrn {{s[0-9]+}}, d0, #1
-  %tmp = call i32 @llvm.arm64.neon.sqrshrn.i32(i64 %A, i32 1)
+  %tmp = call i32 @llvm.aarch64.neon.sqrshrn.i32(i64 %A, i32 1)
   ret i32 %tmp
 }
 
@@ -882,7 +882,7 @@
 ;CHECK-LABEL: sqrshrn8b:
 ;CHECK: sqrshrn.8b v0, {{v[0-9]+}}, #1
         %tmp1 = load <8 x i16>* %A
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.sqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
         ret <8 x i8> %tmp3
 }
 
@@ -890,7 +890,7 @@
 ;CHECK-LABEL: sqrshrn4h:
 ;CHECK: sqrshrn.4h v0, {{v[0-9]+}}, #1
         %tmp1 = load <4 x i32>* %A
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.sqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
         ret <4 x i16> %tmp3
 }
 
@@ -898,7 +898,7 @@
 ;CHECK-LABEL: sqrshrn2s:
 ;CHECK: sqrshrn.2s v0, {{v[0-9]+}}, #1
         %tmp1 = load <2 x i64>* %A
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.sqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
         ret <2 x i32> %tmp3
 }
 
@@ -907,7 +907,7 @@
 ;CHECK: sqrshrn2.16b v0, {{v[0-9]+}}, #1
         %out = load <8 x i8>* %ret
         %tmp1 = load <8 x i16>* %A
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.sqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
         %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
         ret <16 x i8> %tmp4
 }
@@ -917,7 +917,7 @@
 ;CHECK: sqrshrn2.8h v0, {{v[0-9]+}}, #1
         %out = load <4 x i16>* %ret
         %tmp1 = load <4 x i32>* %A
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.sqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
         %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
         ret <8 x i16> %tmp4
 }
@@ -927,20 +927,20 @@
 ;CHECK: sqrshrn2.4s v0, {{v[0-9]+}}, #1
         %out = load <2 x i32>* %ret
         %tmp1 = load <2 x i64>* %A
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.sqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
         %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
         ret <4 x i32> %tmp4
 }
 
-declare i32  @llvm.arm64.neon.sqrshrn.i32(i64, i32) nounwind readnone
-declare <8 x i8>  @llvm.arm64.neon.sqrshrn.v8i8(<8 x i16>, i32) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sqrshrn.v4i16(<4 x i32>, i32) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqrshrn.v2i32(<2 x i64>, i32) nounwind readnone
+declare i32  @llvm.aarch64.neon.sqrshrn.i32(i64, i32) nounwind readnone
+declare <8 x i8>  @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64>, i32) nounwind readnone
 
 define i32 @sqrshrun1s(i64 %A) nounwind {
 ; CHECK-LABEL: sqrshrun1s:
 ; CHECK: sqrshrun {{s[0-9]+}}, d0, #1
-  %tmp = call i32 @llvm.arm64.neon.sqrshrun.i32(i64 %A, i32 1)
+  %tmp = call i32 @llvm.aarch64.neon.sqrshrun.i32(i64 %A, i32 1)
   ret i32 %tmp
 }
 
@@ -948,7 +948,7 @@
 ;CHECK-LABEL: sqrshrun8b:
 ;CHECK: sqrshrun.8b v0, {{v[0-9]+}}, #1
         %tmp1 = load <8 x i16>* %A
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.sqrshrun.v8i8(<8 x i16> %tmp1, i32 1)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> %tmp1, i32 1)
         ret <8 x i8> %tmp3
 }
 
@@ -956,7 +956,7 @@
 ;CHECK-LABEL: sqrshrun4h:
 ;CHECK: sqrshrun.4h v0, {{v[0-9]+}}, #1
         %tmp1 = load <4 x i32>* %A
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.sqrshrun.v4i16(<4 x i32> %tmp1, i32 1)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> %tmp1, i32 1)
         ret <4 x i16> %tmp3
 }
 
@@ -964,7 +964,7 @@
 ;CHECK-LABEL: sqrshrun2s:
 ;CHECK: sqrshrun.2s v0, {{v[0-9]+}}, #1
         %tmp1 = load <2 x i64>* %A
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.sqrshrun.v2i32(<2 x i64> %tmp1, i32 1)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> %tmp1, i32 1)
         ret <2 x i32> %tmp3
 }
 
@@ -973,7 +973,7 @@
 ;CHECK: sqrshrun2.16b v0, {{v[0-9]+}}, #1
         %out = load <8 x i8>* %ret
         %tmp1 = load <8 x i16>* %A
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.sqrshrun.v8i8(<8 x i16> %tmp1, i32 1)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> %tmp1, i32 1)
         %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
         ret <16 x i8> %tmp4
 }
@@ -983,7 +983,7 @@
 ;CHECK: sqrshrun2.8h v0, {{v[0-9]+}}, #1
         %out = load <4 x i16>* %ret
         %tmp1 = load <4 x i32>* %A
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.sqrshrun.v4i16(<4 x i32> %tmp1, i32 1)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> %tmp1, i32 1)
         %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
         ret <8 x i16> %tmp4
 }
@@ -993,20 +993,20 @@
 ;CHECK: sqrshrun2.4s v0, {{v[0-9]+}}, #1
         %out = load <2 x i32>* %ret
         %tmp1 = load <2 x i64>* %A
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.sqrshrun.v2i32(<2 x i64> %tmp1, i32 1)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> %tmp1, i32 1)
         %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
         ret <4 x i32> %tmp4
 }
 
-declare i32  @llvm.arm64.neon.sqrshrun.i32(i64, i32) nounwind readnone
-declare <8 x i8>  @llvm.arm64.neon.sqrshrun.v8i8(<8 x i16>, i32) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sqrshrun.v4i16(<4 x i32>, i32) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqrshrun.v2i32(<2 x i64>, i32) nounwind readnone
+declare i32  @llvm.aarch64.neon.sqrshrun.i32(i64, i32) nounwind readnone
+declare <8 x i8>  @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64>, i32) nounwind readnone
 
 define i32 @uqrshrn1s(i64 %A) nounwind {
 ; CHECK-LABEL: uqrshrn1s:
 ; CHECK: uqrshrn {{s[0-9]+}}, d0, #1
-  %tmp = call i32 @llvm.arm64.neon.uqrshrn.i32(i64 %A, i32 1)
+  %tmp = call i32 @llvm.aarch64.neon.uqrshrn.i32(i64 %A, i32 1)
   ret i32 %tmp
 }
 
@@ -1014,7 +1014,7 @@
 ;CHECK-LABEL: uqrshrn8b:
 ;CHECK: uqrshrn.8b v0, {{v[0-9]+}}, #1
         %tmp1 = load <8 x i16>* %A
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.uqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
         ret <8 x i8> %tmp3
 }
 
@@ -1022,7 +1022,7 @@
 ;CHECK-LABEL: uqrshrn4h:
 ;CHECK: uqrshrn.4h v0, {{v[0-9]+}}, #1
         %tmp1 = load <4 x i32>* %A
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.uqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
         ret <4 x i16> %tmp3
 }
 
@@ -1030,7 +1030,7 @@
 ;CHECK-LABEL: uqrshrn2s:
 ;CHECK: uqrshrn.2s v0, {{v[0-9]+}}, #1
         %tmp1 = load <2 x i64>* %A
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.uqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
         ret <2 x i32> %tmp3
 }
 
@@ -1039,7 +1039,7 @@
 ;CHECK: uqrshrn2.16b v0, {{v[0-9]+}}, #1
         %out = load <8 x i8>* %ret
         %tmp1 = load <8 x i16>* %A
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.uqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> %tmp1, i32 1)
         %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
         ret <16 x i8> %tmp4
 }
@@ -1049,7 +1049,7 @@
 ;CHECK: uqrshrn2.8h v0, {{v[0-9]+}}, #1
         %out = load <4 x i16>* %ret
         %tmp1 = load <4 x i32>* %A
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.uqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> %tmp1, i32 1)
         %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
         ret <8 x i16> %tmp4
 }
@@ -1059,20 +1059,20 @@
 ;CHECK: uqrshrn2.4s v0, {{v[0-9]+}}, #1
         %out = load <2 x i32>* %ret
         %tmp1 = load <2 x i64>* %A
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.uqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> %tmp1, i32 1)
         %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
         ret <4 x i32> %tmp4
 }
 
-declare i32  @llvm.arm64.neon.uqrshrn.i32(i64, i32) nounwind readnone
-declare <8 x i8>  @llvm.arm64.neon.uqrshrn.v8i8(<8 x i16>, i32) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.uqrshrn.v4i16(<4 x i32>, i32) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.uqrshrn.v2i32(<2 x i64>, i32) nounwind readnone
+declare i32  @llvm.aarch64.neon.uqrshrn.i32(i64, i32) nounwind readnone
+declare <8 x i8>  @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64>, i32) nounwind readnone
 
 define i32 @uqshrn1s(i64 %A) nounwind {
 ; CHECK-LABEL: uqshrn1s:
 ; CHECK: uqshrn {{s[0-9]+}}, d0, #1
-  %tmp = call i32 @llvm.arm64.neon.uqshrn.i32(i64 %A, i32 1)
+  %tmp = call i32 @llvm.aarch64.neon.uqshrn.i32(i64 %A, i32 1)
   ret i32 %tmp
 }
 
@@ -1080,7 +1080,7 @@
 ;CHECK-LABEL: uqshrn8b:
 ;CHECK: uqshrn.8b v0, {{v[0-9]+}}, #1
         %tmp1 = load <8 x i16>* %A
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.uqshrn.v8i8(<8 x i16> %tmp1, i32 1)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> %tmp1, i32 1)
         ret <8 x i8> %tmp3
 }
 
@@ -1088,7 +1088,7 @@
 ;CHECK-LABEL: uqshrn4h:
 ;CHECK: uqshrn.4h v0, {{v[0-9]+}}, #1
         %tmp1 = load <4 x i32>* %A
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.uqshrn.v4i16(<4 x i32> %tmp1, i32 1)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> %tmp1, i32 1)
         ret <4 x i16> %tmp3
 }
 
@@ -1096,7 +1096,7 @@
 ;CHECK-LABEL: uqshrn2s:
 ;CHECK: uqshrn.2s v0, {{v[0-9]+}}, #1
         %tmp1 = load <2 x i64>* %A
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.uqshrn.v2i32(<2 x i64> %tmp1, i32 1)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> %tmp1, i32 1)
         ret <2 x i32> %tmp3
 }
 
@@ -1105,7 +1105,7 @@
 ;CHECK: uqshrn2.16b v0, {{v[0-9]+}}, #1
         %out = load <8 x i8>* %ret
         %tmp1 = load <8 x i16>* %A
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.uqshrn.v8i8(<8 x i16> %tmp1, i32 1)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> %tmp1, i32 1)
         %tmp4 = shufflevector <8 x i8> %out, <8 x i8> %tmp3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
         ret <16 x i8> %tmp4
 }
@@ -1115,7 +1115,7 @@
 ;CHECK: uqshrn2.8h v0, {{v[0-9]+}}, #1
   %out = load <4 x i16>* %ret
   %tmp1 = load <4 x i32>* %A
-  %tmp3 = call <4 x i16> @llvm.arm64.neon.uqshrn.v4i16(<4 x i32> %tmp1, i32 1)
+  %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> %tmp1, i32 1)
   %tmp4 = shufflevector <4 x i16> %out, <4 x i16> %tmp3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i16> %tmp4
 }
@@ -1125,15 +1125,15 @@
 ;CHECK: uqshrn2.4s v0, {{v[0-9]+}}, #1
   %out = load <2 x i32>* %ret
   %tmp1 = load <2 x i64>* %A
-  %tmp3 = call <2 x i32> @llvm.arm64.neon.uqshrn.v2i32(<2 x i64> %tmp1, i32 1)
+  %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> %tmp1, i32 1)
   %tmp4 = shufflevector <2 x i32> %out, <2 x i32> %tmp3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i32> %tmp4
 }
 
-declare i32  @llvm.arm64.neon.uqshrn.i32(i64, i32) nounwind readnone
-declare <8 x i8>  @llvm.arm64.neon.uqshrn.v8i8(<8 x i16>, i32) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.uqshrn.v4i16(<4 x i32>, i32) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.uqshrn.v2i32(<2 x i64>, i32) nounwind readnone
+declare i32  @llvm.aarch64.neon.uqshrn.i32(i64, i32) nounwind readnone
+declare <8 x i8>  @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64>, i32) nounwind readnone
 
 define <8 x i16> @ushll8h(<8 x i8>* %A) nounwind {
 ;CHECK-LABEL: ushll8h:
@@ -1253,7 +1253,7 @@
 ;CHECK-LABEL: sqshli8b:
 ;CHECK: sqshl.8b v0, {{v[0-9]+}}, #1
         %tmp1 = load <8 x i8>* %A
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.sqshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
         ret <8 x i8> %tmp3
 }
 
@@ -1261,7 +1261,7 @@
 ;CHECK-LABEL: sqshli4h:
 ;CHECK: sqshl.4h v0, {{v[0-9]+}}, #1
         %tmp1 = load <4 x i16>* %A
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.sqshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
         ret <4 x i16> %tmp3
 }
 
@@ -1269,7 +1269,7 @@
 ;CHECK-LABEL: sqshli2s:
 ;CHECK: sqshl.2s v0, {{v[0-9]+}}, #1
         %tmp1 = load <2 x i32>* %A
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.sqshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 1, i32 1>)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 1, i32 1>)
         ret <2 x i32> %tmp3
 }
 
@@ -1277,7 +1277,7 @@
 ;CHECK-LABEL: sqshli16b:
 ;CHECK: sqshl.16b v0, {{v[0-9]+}}, #1
         %tmp1 = load <16 x i8>* %A
-        %tmp3 = call <16 x i8> @llvm.arm64.neon.sqshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+        %tmp3 = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
         ret <16 x i8> %tmp3
 }
 
@@ -1285,7 +1285,7 @@
 ;CHECK-LABEL: sqshli8h:
 ;CHECK: sqshl.8h v0, {{v[0-9]+}}, #1
         %tmp1 = load <8 x i16>* %A
-        %tmp3 = call <8 x i16> @llvm.arm64.neon.sqshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+        %tmp3 = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
         ret <8 x i16> %tmp3
 }
 
@@ -1293,7 +1293,7 @@
 ;CHECK-LABEL: sqshli4s:
 ;CHECK: sqshl.4s v0, {{v[0-9]+}}, #1
         %tmp1 = load <4 x i32>* %A
-        %tmp3 = call <4 x i32> @llvm.arm64.neon.sqshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+        %tmp3 = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
         ret <4 x i32> %tmp3
 }
 
@@ -1301,7 +1301,7 @@
 ;CHECK-LABEL: sqshli2d:
 ;CHECK: sqshl.2d v0, {{v[0-9]+}}, #1
         %tmp1 = load <2 x i64>* %A
-        %tmp3 = call <2 x i64> @llvm.arm64.neon.sqshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 1, i64 1>)
+        %tmp3 = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 1, i64 1>)
         ret <2 x i64> %tmp3
 }
 
@@ -1309,7 +1309,7 @@
 ;CHECK-LABEL: uqshli8b:
 ;CHECK: uqshl.8b v0, {{v[0-9]+}}, #1
         %tmp1 = load <8 x i8>* %A
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.uqshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
         ret <8 x i8> %tmp3
 }
 
@@ -1317,7 +1317,7 @@
 ;CHECK-LABEL: uqshli4h:
 ;CHECK: uqshl.4h v0, {{v[0-9]+}}, #1
         %tmp1 = load <4 x i16>* %A
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.uqshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
         ret <4 x i16> %tmp3
 }
 
@@ -1325,7 +1325,7 @@
 ;CHECK-LABEL: uqshli2s:
 ;CHECK: uqshl.2s v0, {{v[0-9]+}}, #1
         %tmp1 = load <2 x i32>* %A
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.uqshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 1, i32 1>)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 1, i32 1>)
         ret <2 x i32> %tmp3
 }
 
@@ -1333,7 +1333,7 @@
 ;CHECK-LABEL: uqshli16b:
 ;CHECK: uqshl.16b
         %tmp1 = load <16 x i8>* %A
-        %tmp3 = call <16 x i8> @llvm.arm64.neon.uqshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+        %tmp3 = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
         ret <16 x i8> %tmp3
 }
 
@@ -1341,7 +1341,7 @@
 ;CHECK-LABEL: uqshli8h:
 ;CHECK: uqshl.8h v0, {{v[0-9]+}}, #1
         %tmp1 = load <8 x i16>* %A
-        %tmp3 = call <8 x i16> @llvm.arm64.neon.uqshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+        %tmp3 = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
         ret <8 x i16> %tmp3
 }
 
@@ -1349,7 +1349,7 @@
 ;CHECK-LABEL: uqshli4s:
 ;CHECK: uqshl.4s v0, {{v[0-9]+}}, #1
         %tmp1 = load <4 x i32>* %A
-        %tmp3 = call <4 x i32> @llvm.arm64.neon.uqshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+        %tmp3 = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
         ret <4 x i32> %tmp3
 }
 
@@ -1357,7 +1357,7 @@
 ;CHECK-LABEL: uqshli2d:
 ;CHECK: uqshl.2d v0, {{v[0-9]+}}, #1
         %tmp1 = load <2 x i64>* %A
-        %tmp3 = call <2 x i64> @llvm.arm64.neon.uqshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 1, i64 1>)
+        %tmp3 = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 1, i64 1>)
         ret <2 x i64> %tmp3
 }
 
@@ -1365,7 +1365,7 @@
 ;CHECK-LABEL: ursra8b:
 ;CHECK: ursra.8b v0, {{v[0-9]+}}, #1
         %tmp1 = load <8 x i8>* %A
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
         %tmp4 = load <8 x i8>* %B
         %tmp5 = add <8 x i8> %tmp3, %tmp4
         ret <8 x i8> %tmp5
@@ -1375,7 +1375,7 @@
 ;CHECK-LABEL: ursra4h:
 ;CHECK: ursra.4h v0, {{v[0-9]+}}, #1
         %tmp1 = load <4 x i16>* %A
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
         %tmp4 = load <4 x i16>* %B
         %tmp5 = add <4 x i16> %tmp3, %tmp4
         ret <4 x i16> %tmp5
@@ -1385,7 +1385,7 @@
 ;CHECK-LABEL: ursra2s:
 ;CHECK: ursra.2s v0, {{v[0-9]+}}, #1
         %tmp1 = load <2 x i32>* %A
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
         %tmp4 = load <2 x i32>* %B
         %tmp5 = add <2 x i32> %tmp3, %tmp4
         ret <2 x i32> %tmp5
@@ -1395,7 +1395,7 @@
 ;CHECK-LABEL: ursra16b:
 ;CHECK: ursra.16b v0, {{v[0-9]+}}, #1
         %tmp1 = load <16 x i8>* %A
-        %tmp3 = call <16 x i8> @llvm.arm64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+        %tmp3 = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
         %tmp4 = load <16 x i8>* %B
         %tmp5 = add <16 x i8> %tmp3, %tmp4
          ret <16 x i8> %tmp5
@@ -1405,7 +1405,7 @@
 ;CHECK-LABEL: ursra8h:
 ;CHECK: ursra.8h v0, {{v[0-9]+}}, #1
         %tmp1 = load <8 x i16>* %A
-        %tmp3 = call <8 x i16> @llvm.arm64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+        %tmp3 = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
         %tmp4 = load <8 x i16>* %B
         %tmp5 = add <8 x i16> %tmp3, %tmp4
          ret <8 x i16> %tmp5
@@ -1415,7 +1415,7 @@
 ;CHECK-LABEL: ursra4s:
 ;CHECK: ursra.4s v0, {{v[0-9]+}}, #1
         %tmp1 = load <4 x i32>* %A
-        %tmp3 = call <4 x i32> @llvm.arm64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+        %tmp3 = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
         %tmp4 = load <4 x i32>* %B
         %tmp5 = add <4 x i32> %tmp3, %tmp4
          ret <4 x i32> %tmp5
@@ -1425,7 +1425,7 @@
 ;CHECK-LABEL: ursra2d:
 ;CHECK: ursra.2d v0, {{v[0-9]+}}, #1
         %tmp1 = load <2 x i64>* %A
-        %tmp3 = call <2 x i64> @llvm.arm64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
+        %tmp3 = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
         %tmp4 = load <2 x i64>* %B
         %tmp5 = add <2 x i64> %tmp3, %tmp4
          ret <2 x i64> %tmp5
@@ -1435,7 +1435,7 @@
 ;CHECK-LABEL: srsra8b:
 ;CHECK: srsra.8b v0, {{v[0-9]+}}, #1
         %tmp1 = load <8 x i8>* %A
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %tmp1, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
         %tmp4 = load <8 x i8>* %B
         %tmp5 = add <8 x i8> %tmp3, %tmp4
         ret <8 x i8> %tmp5
@@ -1445,7 +1445,7 @@
 ;CHECK-LABEL: srsra4h:
 ;CHECK: srsra.4h v0, {{v[0-9]+}}, #1
         %tmp1 = load <4 x i16>* %A
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %tmp1, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
         %tmp4 = load <4 x i16>* %B
         %tmp5 = add <4 x i16> %tmp3, %tmp4
         ret <4 x i16> %tmp5
@@ -1455,7 +1455,7 @@
 ;CHECK-LABEL: srsra2s:
 ;CHECK: srsra.2s v0, {{v[0-9]+}}, #1
         %tmp1 = load <2 x i32>* %A
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %tmp1, <2 x i32> <i32 -1, i32 -1>)
         %tmp4 = load <2 x i32>* %B
         %tmp5 = add <2 x i32> %tmp3, %tmp4
         ret <2 x i32> %tmp5
@@ -1465,7 +1465,7 @@
 ;CHECK-LABEL: srsra16b:
 ;CHECK: srsra.16b v0, {{v[0-9]+}}, #1
         %tmp1 = load <16 x i8>* %A
-        %tmp3 = call <16 x i8> @llvm.arm64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+        %tmp3 = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %tmp1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
         %tmp4 = load <16 x i8>* %B
         %tmp5 = add <16 x i8> %tmp3, %tmp4
          ret <16 x i8> %tmp5
@@ -1475,7 +1475,7 @@
 ;CHECK-LABEL: srsra8h:
 ;CHECK: srsra.8h v0, {{v[0-9]+}}, #1
         %tmp1 = load <8 x i16>* %A
-        %tmp3 = call <8 x i16> @llvm.arm64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+        %tmp3 = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %tmp1, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
         %tmp4 = load <8 x i16>* %B
         %tmp5 = add <8 x i16> %tmp3, %tmp4
          ret <8 x i16> %tmp5
@@ -1485,7 +1485,7 @@
 ;CHECK-LABEL: srsra4s:
 ;CHECK: srsra.4s v0, {{v[0-9]+}}, #1
         %tmp1 = load <4 x i32>* %A
-        %tmp3 = call <4 x i32> @llvm.arm64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+        %tmp3 = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %tmp1, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
         %tmp4 = load <4 x i32>* %B
         %tmp5 = add <4 x i32> %tmp3, %tmp4
          ret <4 x i32> %tmp5
@@ -1495,7 +1495,7 @@
 ;CHECK-LABEL: srsra2d:
 ;CHECK: srsra.2d v0, {{v[0-9]+}}, #1
         %tmp1 = load <2 x i64>* %A
-        %tmp3 = call <2 x i64> @llvm.arm64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
+        %tmp3 = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %tmp1, <2 x i64> <i64 -1, i64 -1>)
         %tmp4 = load <2 x i64>* %B
         %tmp5 = add <2 x i64> %tmp3, %tmp4
          ret <2 x i64> %tmp5
@@ -1831,7 +1831,7 @@
 ;CHECK: sli.8b v0, {{v[0-9]+}}, #1
         %tmp1 = load <8 x i8>* %A
         %tmp2 = load <8 x i8>* %B
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.vsli.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, i32 1)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2, i32 1)
         ret <8 x i8> %tmp3
 }
 
@@ -1840,7 +1840,7 @@
 ;CHECK: sli.4h v0, {{v[0-9]+}}, #1
         %tmp1 = load <4 x i16>* %A
         %tmp2 = load <4 x i16>* %B
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.vsli.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, i32 1)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2, i32 1)
         ret <4 x i16> %tmp3
 }
 
@@ -1849,7 +1849,7 @@
 ;CHECK: sli.2s v0, {{v[0-9]+}}, #1
         %tmp1 = load <2 x i32>* %A
         %tmp2 = load <2 x i32>* %B
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.vsli.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, i32 1)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2, i32 1)
         ret <2 x i32> %tmp3
 }
 
@@ -1858,7 +1858,7 @@
 ;CHECK: sli d0, {{d[0-9]+}}, #1
         %tmp1 = load <1 x i64>* %A
         %tmp2 = load <1 x i64>* %B
-        %tmp3 = call <1 x i64> @llvm.arm64.neon.vsli.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2, i32 1)
+        %tmp3 = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2, i32 1)
         ret <1 x i64> %tmp3
 }
 
@@ -1867,7 +1867,7 @@
 ;CHECK: sli.16b v0, {{v[0-9]+}}, #1
         %tmp1 = load <16 x i8>* %A
         %tmp2 = load <16 x i8>* %B
-        %tmp3 = call <16 x i8> @llvm.arm64.neon.vsli.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, i32 1)
+        %tmp3 = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2, i32 1)
         ret <16 x i8> %tmp3
 }
 
@@ -1876,7 +1876,7 @@
 ;CHECK: sli.8h v0, {{v[0-9]+}}, #1
         %tmp1 = load <8 x i16>* %A
         %tmp2 = load <8 x i16>* %B
-        %tmp3 = call <8 x i16> @llvm.arm64.neon.vsli.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, i32 1)
+        %tmp3 = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2, i32 1)
         ret <8 x i16> %tmp3
 }
 
@@ -1885,7 +1885,7 @@
 ;CHECK: sli.4s v0, {{v[0-9]+}}, #1
         %tmp1 = load <4 x i32>* %A
         %tmp2 = load <4 x i32>* %B
-        %tmp3 = call <4 x i32> @llvm.arm64.neon.vsli.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, i32 1)
+        %tmp3 = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2, i32 1)
         ret <4 x i32> %tmp3
 }
 
@@ -1894,19 +1894,19 @@
 ;CHECK: sli.2d v0, {{v[0-9]+}}, #1
         %tmp1 = load <2 x i64>* %A
         %tmp2 = load <2 x i64>* %B
-        %tmp3 = call <2 x i64> @llvm.arm64.neon.vsli.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2, i32 1)
+        %tmp3 = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2, i32 1)
         ret <2 x i64> %tmp3
 }
 
-declare <8 x i8>  @llvm.arm64.neon.vsli.v8i8(<8 x i8>, <8 x i8>, i32) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.vsli.v4i16(<4 x i16>, <4 x i16>, i32) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.vsli.v2i32(<2 x i32>, <2 x i32>, i32) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.vsli.v1i64(<1 x i64>, <1 x i64>, i32) nounwind readnone
+declare <8 x i8>  @llvm.aarch64.neon.vsli.v8i8(<8 x i8>, <8 x i8>, i32) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16>, <4 x i16>, i32) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32>, <2 x i32>, i32) nounwind readnone
+declare <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64>, <1 x i64>, i32) nounwind readnone
 
-declare <16 x i8> @llvm.arm64.neon.vsli.v16i8(<16 x i8>, <16 x i8>, i32) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.vsli.v8i16(<8 x i16>, <8 x i16>, i32) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.vsli.v4i32(<4 x i32>, <4 x i32>, i32) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.vsli.v2i64(<2 x i64>, <2 x i64>, i32) nounwind readnone
+declare <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8>, <16 x i8>, i32) nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16>, <8 x i16>, i32) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32>, <4 x i32>, i32) nounwind readnone
+declare <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64>, <2 x i64>, i32) nounwind readnone
 
 define <1 x i64> @ashr_v1i64(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-LABEL: ashr_v1i64:
diff --git a/llvm/test/CodeGen/ARM64/vshr.ll b/llvm/test/CodeGen/AArch64/arm64-vshr.ll
similarity index 95%
rename from llvm/test/CodeGen/ARM64/vshr.ll
rename to llvm/test/CodeGen/AArch64/arm64-vshr.ll
index 1da8f60..21eb579 100644
--- a/llvm/test/CodeGen/ARM64/vshr.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vshr.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s -mcpu=cyclone | FileCheck %s
+; RUN: llc -march=arm64 -aarch64-neon-syntax=apple < %s -mcpu=cyclone | FileCheck %s
 
 define <8 x i16> @testShiftRightArith_v8i16(<8 x i16> %a, <8 x i16> %b) #0 {
 ; CHECK-LABEL: testShiftRightArith_v8i16:
diff --git a/llvm/test/CodeGen/ARM64/vshuffle.ll b/llvm/test/CodeGen/AArch64/arm64-vshuffle.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/vshuffle.ll
rename to llvm/test/CodeGen/AArch64/arm64-vshuffle.ll
diff --git a/llvm/test/CodeGen/AArch64/arm64-vsqrt.ll b/llvm/test/CodeGen/AArch64/arm64-vsqrt.ll
new file mode 100644
index 0000000..02b7c7e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-vsqrt.ll
@@ -0,0 +1,232 @@
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
+
+define <2 x float> @frecps_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: frecps_2s:
+;CHECK: frecps.2s
+	%tmp1 = load <2 x float>* %A
+	%tmp2 = load <2 x float>* %B
+	%tmp3 = call <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+	ret <2 x float> %tmp3
+}
+
+define <4 x float> @frecps_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: frecps_4s:
+;CHECK: frecps.4s
+	%tmp1 = load <4 x float>* %A
+	%tmp2 = load <4 x float>* %B
+	%tmp3 = call <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+	ret <4 x float> %tmp3
+}
+
+define <2 x double> @frecps_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: frecps_2d:
+;CHECK: frecps.2d
+	%tmp1 = load <2 x double>* %A
+	%tmp2 = load <2 x double>* %B
+	%tmp3 = call <2 x double> @llvm.aarch64.neon.frecps.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+	ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.frecps.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+
+define <2 x float> @frsqrts_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
+;CHECK-LABEL: frsqrts_2s:
+;CHECK: frsqrts.2s
+	%tmp1 = load <2 x float>* %A
+	%tmp2 = load <2 x float>* %B
+	%tmp3 = call <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
+	ret <2 x float> %tmp3
+}
+
+define <4 x float> @frsqrts_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
+;CHECK-LABEL: frsqrts_4s:
+;CHECK: frsqrts.4s
+	%tmp1 = load <4 x float>* %A
+	%tmp2 = load <4 x float>* %B
+	%tmp3 = call <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
+	ret <4 x float> %tmp3
+}
+
+define <2 x double> @frsqrts_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
+;CHECK-LABEL: frsqrts_2d:
+;CHECK: frsqrts.2d
+	%tmp1 = load <2 x double>* %A
+	%tmp2 = load <2 x double>* %B
+	%tmp3 = call <2 x double> @llvm.aarch64.neon.frsqrts.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
+	ret <2 x double> %tmp3
+}
+
+declare <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.frsqrts.v2f64(<2 x double>, <2 x double>) nounwind readnone
+
+define <2 x float> @frecpe_2s(<2 x float>* %A) nounwind {
+;CHECK-LABEL: frecpe_2s:
+;CHECK: frecpe.2s
+	%tmp1 = load <2 x float>* %A
+	%tmp3 = call <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float> %tmp1)
+	ret <2 x float> %tmp3
+}
+
+define <4 x float> @frecpe_4s(<4 x float>* %A) nounwind {
+;CHECK-LABEL: frecpe_4s:
+;CHECK: frecpe.4s
+	%tmp1 = load <4 x float>* %A
+	%tmp3 = call <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float> %tmp1)
+	ret <4 x float> %tmp3
+}
+
+define <2 x double> @frecpe_2d(<2 x double>* %A) nounwind {
+;CHECK-LABEL: frecpe_2d:
+;CHECK: frecpe.2d
+	%tmp1 = load <2 x double>* %A
+	%tmp3 = call <2 x double> @llvm.aarch64.neon.frecpe.v2f64(<2 x double> %tmp1)
+	ret <2 x double> %tmp3
+}
+
+define float @frecpe_s(float* %A) nounwind {
+;CHECK-LABEL: frecpe_s:
+;CHECK: frecpe s0, {{s[0-9]+}}
+  %tmp1 = load float* %A
+  %tmp3 = call float @llvm.aarch64.neon.frecpe.f32(float %tmp1)
+  ret float %tmp3
+}
+
+define double @frecpe_d(double* %A) nounwind {
+;CHECK-LABEL: frecpe_d:
+;CHECK: frecpe d0, {{d[0-9]+}}
+  %tmp1 = load double* %A
+  %tmp3 = call double @llvm.aarch64.neon.frecpe.f64(double %tmp1)
+  ret double %tmp3
+}
+
+declare <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.frecpe.v2f64(<2 x double>) nounwind readnone
+declare float @llvm.aarch64.neon.frecpe.f32(float) nounwind readnone
+declare double @llvm.aarch64.neon.frecpe.f64(double) nounwind readnone
+
+define float @frecpx_s(float* %A) nounwind {
+;CHECK-LABEL: frecpx_s:
+;CHECK: frecpx s0, {{s[0-9]+}}
+  %tmp1 = load float* %A
+  %tmp3 = call float @llvm.aarch64.neon.frecpx.f32(float %tmp1)
+  ret float %tmp3
+}
+
+define double @frecpx_d(double* %A) nounwind {
+;CHECK-LABEL: frecpx_d:
+;CHECK: frecpx d0, {{d[0-9]+}}
+  %tmp1 = load double* %A
+  %tmp3 = call double @llvm.aarch64.neon.frecpx.f64(double %tmp1)
+  ret double %tmp3
+}
+
+declare float @llvm.aarch64.neon.frecpx.f32(float) nounwind readnone
+declare double @llvm.aarch64.neon.frecpx.f64(double) nounwind readnone
+
+define <2 x float> @frsqrte_2s(<2 x float>* %A) nounwind {
+;CHECK-LABEL: frsqrte_2s:
+;CHECK: frsqrte.2s
+	%tmp1 = load <2 x float>* %A
+	%tmp3 = call <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> %tmp1)
+	ret <2 x float> %tmp3
+}
+
+define <4 x float> @frsqrte_4s(<4 x float>* %A) nounwind {
+;CHECK-LABEL: frsqrte_4s:
+;CHECK: frsqrte.4s
+	%tmp1 = load <4 x float>* %A
+	%tmp3 = call <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> %tmp1)
+	ret <4 x float> %tmp3
+}
+
+define <2 x double> @frsqrte_2d(<2 x double>* %A) nounwind {
+;CHECK-LABEL: frsqrte_2d:
+;CHECK: frsqrte.2d
+	%tmp1 = load <2 x double>* %A
+	%tmp3 = call <2 x double> @llvm.aarch64.neon.frsqrte.v2f64(<2 x double> %tmp1)
+	ret <2 x double> %tmp3
+}
+
+define float @frsqrte_s(float* %A) nounwind {
+;CHECK-LABEL: frsqrte_s:
+;CHECK: frsqrte s0, {{s[0-9]+}}
+  %tmp1 = load float* %A
+  %tmp3 = call float @llvm.aarch64.neon.frsqrte.f32(float %tmp1)
+  ret float %tmp3
+}
+
+define double @frsqrte_d(double* %A) nounwind {
+;CHECK-LABEL: frsqrte_d:
+;CHECK: frsqrte d0, {{d[0-9]+}}
+  %tmp1 = load double* %A
+  %tmp3 = call double @llvm.aarch64.neon.frsqrte.f64(double %tmp1)
+  ret double %tmp3
+}
+
+declare <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float>) nounwind readnone
+declare <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float>) nounwind readnone
+declare <2 x double> @llvm.aarch64.neon.frsqrte.v2f64(<2 x double>) nounwind readnone
+declare float @llvm.aarch64.neon.frsqrte.f32(float) nounwind readnone
+declare double @llvm.aarch64.neon.frsqrte.f64(double) nounwind readnone
+
+define <2 x i32> @urecpe_2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: urecpe_2s:
+;CHECK: urecpe.2s
+	%tmp1 = load <2 x i32>* %A
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.urecpe.v2i32(<2 x i32> %tmp1)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @urecpe_4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: urecpe_4s:
+;CHECK: urecpe.4s
+	%tmp1 = load <4 x i32>* %A
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.urecpe.v4i32(<4 x i32> %tmp1)
+	ret <4 x i32> %tmp3
+}
+
+declare <2 x i32> @llvm.aarch64.neon.urecpe.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.urecpe.v4i32(<4 x i32>) nounwind readnone
+
+define <2 x i32> @ursqrte_2s(<2 x i32>* %A) nounwind {
+;CHECK-LABEL: ursqrte_2s:
+;CHECK: ursqrte.2s
+	%tmp1 = load <2 x i32>* %A
+	%tmp3 = call <2 x i32> @llvm.aarch64.neon.ursqrte.v2i32(<2 x i32> %tmp1)
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @ursqrte_4s(<4 x i32>* %A) nounwind {
+;CHECK-LABEL: ursqrte_4s:
+;CHECK: ursqrte.4s
+	%tmp1 = load <4 x i32>* %A
+	%tmp3 = call <4 x i32> @llvm.aarch64.neon.ursqrte.v4i32(<4 x i32> %tmp1)
+	ret <4 x i32> %tmp3
+}
+
+declare <2 x i32> @llvm.aarch64.neon.ursqrte.v2i32(<2 x i32>) nounwind readnone
+declare <4 x i32> @llvm.aarch64.neon.ursqrte.v4i32(<4 x i32>) nounwind readnone
+
+define float @f1(float %a, float %b) nounwind readnone optsize ssp {
+; CHECK-LABEL: f1:
+; CHECK: frsqrts s0, s0, s1
+; CHECK-NEXT: ret
+  %vrsqrtss.i = tail call float @llvm.aarch64.neon.frsqrts.f32(float %a, float %b) nounwind
+  ret float %vrsqrtss.i
+}
+
+define double @f2(double %a, double %b) nounwind readnone optsize ssp {
+; CHECK-LABEL: f2:
+; CHECK: frsqrts d0, d0, d1
+; CHECK-NEXT: ret
+  %vrsqrtsd.i = tail call double @llvm.aarch64.neon.frsqrts.f64(double %a, double %b) nounwind
+  ret double %vrsqrtsd.i
+}
+
+declare double @llvm.aarch64.neon.frsqrts.f64(double, double) nounwind readnone
+declare float @llvm.aarch64.neon.frsqrts.f32(float, float) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM64/vsra.ll b/llvm/test/CodeGen/AArch64/arm64-vsra.ll
similarity index 97%
rename from llvm/test/CodeGen/ARM64/vsra.ll
rename to llvm/test/CodeGen/AArch64/arm64-vsra.ll
index 3611eb3..5e9cef3 100644
--- a/llvm/test/CodeGen/ARM64/vsra.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vsra.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 define <8 x i8> @vsras8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ;CHECK-LABEL: vsras8:
diff --git a/llvm/test/CodeGen/ARM64/vsub.ll b/llvm/test/CodeGen/AArch64/arm64-vsub.ll
similarity index 84%
rename from llvm/test/CodeGen/ARM64/vsub.ll
rename to llvm/test/CodeGen/AArch64/arm64-vsub.ll
index 5c7e84f..c2c8755 100644
--- a/llvm/test/CodeGen/ARM64/vsub.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vsub.ll
@@ -1,11 +1,11 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 define <8 x i8> @subhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: subhn8b:
 ;CHECK: subhn.8b
         %tmp1 = load <8 x i16>* %A
         %tmp2 = load <8 x i16>* %B
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.subhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
         ret <8 x i8> %tmp3
 }
 
@@ -14,7 +14,7 @@
 ;CHECK: subhn.4h
         %tmp1 = load <4 x i32>* %A
         %tmp2 = load <4 x i32>* %B
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.subhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
         ret <4 x i16> %tmp3
 }
 
@@ -23,7 +23,7 @@
 ;CHECK: subhn.2s
         %tmp1 = load <2 x i64>* %A
         %tmp2 = load <2 x i64>* %B
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.subhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
         ret <2 x i32> %tmp3
 }
 
@@ -31,8 +31,8 @@
 ;CHECK-LABEL: subhn2_16b:
 ;CHECK: subhn.8b
 ;CHECK-NEXT: subhn2.16b
-  %vsubhn2.i = tail call <8 x i8> @llvm.arm64.neon.subhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
-  %vsubhn_high2.i = tail call <8 x i8> @llvm.arm64.neon.subhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+  %vsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+  %vsubhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
   %res = shufflevector <8 x i8> %vsubhn2.i, <8 x i8> %vsubhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <16 x i8> %res
 }
@@ -41,8 +41,8 @@
 ;CHECK-LABEL: subhn2_8h:
 ;CHECK: subhn.4h
 ;CHECK-NEXT: subhn2.8h
-  %vsubhn2.i = tail call <4 x i16> @llvm.arm64.neon.subhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
-  %vsubhn_high3.i = tail call <4 x i16> @llvm.arm64.neon.subhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+  %vsubhn2.i = tail call <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+  %vsubhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
   %res = shufflevector <4 x i16> %vsubhn2.i, <4 x i16> %vsubhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i16> %res
 }
@@ -51,22 +51,22 @@
 ;CHECK-LABEL: subhn2_4s:
 ;CHECK: subhn.2s
 ;CHECK-NEXT: subhn2.4s
-  %vsubhn2.i = tail call <2 x i32> @llvm.arm64.neon.subhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
-  %vsubhn_high3.i = tail call <2 x i32> @llvm.arm64.neon.subhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+  %vsubhn2.i = tail call <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+  %vsubhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
   %res = shufflevector <2 x i32> %vsubhn2.i, <2 x i32> %vsubhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i32> %res
 }
 
-declare <2 x i32> @llvm.arm64.neon.subhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.subhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.subhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.subhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.subhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.subhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <8 x i8> @rsubhn8b(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: rsubhn8b:
 ;CHECK: rsubhn.8b
         %tmp1 = load <8 x i16>* %A
         %tmp2 = load <8 x i16>* %B
-        %tmp3 = call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
+        %tmp3 = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2)
         ret <8 x i8> %tmp3
 }
 
@@ -75,7 +75,7 @@
 ;CHECK: rsubhn.4h
         %tmp1 = load <4 x i32>* %A
         %tmp2 = load <4 x i32>* %B
-        %tmp3 = call <4 x i16> @llvm.arm64.neon.rsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
+        %tmp3 = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2)
         ret <4 x i16> %tmp3
 }
 
@@ -84,7 +84,7 @@
 ;CHECK: rsubhn.2s
         %tmp1 = load <2 x i64>* %A
         %tmp2 = load <2 x i64>* %B
-        %tmp3 = call <2 x i32> @llvm.arm64.neon.rsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
+        %tmp3 = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2)
         ret <2 x i32> %tmp3
 }
 
@@ -92,8 +92,8 @@
 ;CHECK-LABEL: rsubhn2_16b:
 ;CHECK: rsubhn.8b
 ;CHECK-NEXT: rsubhn2.16b
-  %vrsubhn2.i = tail call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
-  %vrsubhn_high2.i = tail call <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+  %vrsubhn2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
+  %vrsubhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
   %res = shufflevector <8 x i8> %vrsubhn2.i, <8 x i8> %vrsubhn_high2.i, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <16 x i8> %res
 }
@@ -102,8 +102,8 @@
 ;CHECK-LABEL: rsubhn2_8h:
 ;CHECK: rsubhn.4h
 ;CHECK-NEXT: rsubhn2.8h
-  %vrsubhn2.i = tail call <4 x i16> @llvm.arm64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
-  %vrsubhn_high3.i = tail call <4 x i16> @llvm.arm64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+  %vrsubhn2.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
+  %vrsubhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
   %res = shufflevector <4 x i16> %vrsubhn2.i, <4 x i16> %vrsubhn_high3.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i16> %res
 }
@@ -112,15 +112,15 @@
 ;CHECK-LABEL: rsubhn2_4s:
 ;CHECK: rsubhn.2s
 ;CHECK-NEXT: rsubhn2.4s
-  %vrsubhn2.i = tail call <2 x i32> @llvm.arm64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
-  %vrsubhn_high3.i = tail call <2 x i32> @llvm.arm64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+  %vrsubhn2.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
+  %vrsubhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
   %res = shufflevector <2 x i32> %vrsubhn2.i, <2 x i32> %vrsubhn_high3.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x i32> %res
 }
 
-declare <2 x i32> @llvm.arm64.neon.rsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.rsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.rsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+declare <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
+declare <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <8 x i16> @ssubl8h(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ;CHECK-LABEL: ssubl8h:
diff --git a/llvm/test/CodeGen/ARM64/weak-reference.ll b/llvm/test/CodeGen/AArch64/arm64-weak-reference.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/weak-reference.ll
rename to llvm/test/CodeGen/AArch64/arm64-weak-reference.ll
diff --git a/llvm/test/CodeGen/ARM64/xaluo.ll b/llvm/test/CodeGen/AArch64/arm64-xaluo.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/xaluo.ll
rename to llvm/test/CodeGen/AArch64/arm64-xaluo.ll
diff --git a/llvm/test/CodeGen/ARM64/zero-cycle-regmov.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/zero-cycle-regmov.ll
rename to llvm/test/CodeGen/AArch64/arm64-zero-cycle-regmov.ll
diff --git a/llvm/test/CodeGen/ARM64/zero-cycle-zeroing.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/zero-cycle-zeroing.ll
rename to llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
diff --git a/llvm/test/CodeGen/ARM64/zext.ll b/llvm/test/CodeGen/AArch64/arm64-zext.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/zext.ll
rename to llvm/test/CodeGen/AArch64/arm64-zext.ll
diff --git a/llvm/test/CodeGen/ARM64/zextload-unscaled.ll b/llvm/test/CodeGen/AArch64/arm64-zextload-unscaled.ll
similarity index 100%
rename from llvm/test/CodeGen/ARM64/zextload-unscaled.ll
rename to llvm/test/CodeGen/AArch64/arm64-zextload-unscaled.ll
diff --git a/llvm/test/CodeGen/ARM64/zip.ll b/llvm/test/CodeGen/AArch64/arm64-zip.ll
similarity index 97%
rename from llvm/test/CodeGen/ARM64/zip.ll
rename to llvm/test/CodeGen/AArch64/arm64-zip.ll
index d06a9f8..304b280 100644
--- a/llvm/test/CodeGen/ARM64/zip.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-zip.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
+; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s
 
 define <8 x i8> @vzipi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ;CHECK-LABEL: vzipi8:
diff --git a/llvm/test/CodeGen/AArch64/atomic-ops-not-barriers.ll b/llvm/test/CodeGen/AArch64/atomic-ops-not-barriers.ll
index 162430b..da095a0 100644
--- a/llvm/test/CodeGen/AArch64/atomic-ops-not-barriers.ll
+++ b/llvm/test/CodeGen/AArch64/atomic-ops-not-barriers.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
 
 define i32 @foo(i32* %var, i1 %cond) {
 ; CHECK-LABEL: foo:
diff --git a/llvm/test/CodeGen/AArch64/atomic-ops.ll b/llvm/test/CodeGen/AArch64/atomic-ops.ll
index 58ea735..58b5d1d 100644
--- a/llvm/test/CodeGen/AArch64/atomic-ops.ll
+++ b/llvm/test/CodeGen/AArch64/atomic-ops.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=arm64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64
-; RUN: llc -mtriple=arm64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-REG
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-REG
 
 
 ; Point of CHECK-REG is to make sure UNPREDICTABLE instructions aren't created
@@ -501,9 +501,9 @@
   ; w0 below is a reasonable guess but could change: it certainly comes into the
   ;  function there.
 
-; CHECK-ARM64-NEXT: sxtb w[[OLD_EXT:[0-9]+]], w[[OLD]]
-; CHECK-ARM64-NEXT: cmp w[[OLD_EXT]], w0, sxtb
-; CHECK-ARM64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, le
+; CHECK-NEXT: sxtb w[[OLD_EXT:[0-9]+]], w[[OLD]]
+; CHECK-NEXT: cmp w[[OLD_EXT]], w0, sxtb
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, le
 
 ; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
 ; CHECK-NEXT: cbnz [[STATUS]], .LBB{{[0-9]+}}_1
@@ -525,9 +525,9 @@
   ; w0 below is a reasonable guess but could change: it certainly comes into the
   ;  function there.
 
-; CHECK-ARM64-NEXT: sxth w[[OLD_EXT:[0-9]+]], w[[OLD]]
-; CHECK-ARM64-NEXT: cmp w[[OLD_EXT]], w0, sxth
-; CHECK-ARM64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, le
+; CHECK-NEXT: sxth w[[OLD_EXT:[0-9]+]], w[[OLD]]
+; CHECK-NEXT: cmp w[[OLD_EXT]], w0, sxth
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, le
 
 
 ; CHECK-NEXT: stlxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
@@ -550,8 +550,8 @@
   ; w0 below is a reasonable guess but could change: it certainly comes into the
   ;  function there.
 
-; CHECK-ARM64-NEXT: cmp w[[OLD]], w0
-; CHECK-ARM64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, le
+; CHECK-NEXT: cmp w[[OLD]], w0
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, le
 
 
 ; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
@@ -574,8 +574,8 @@
   ; x0 below is a reasonable guess but could change: it certainly comes into the
   ; function there.
 
-; CHECK-ARM64-NEXT: cmp x[[OLD]], x0
-; CHECK-ARM64-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, le
+; CHECK-NEXT: cmp x[[OLD]], x0
+; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, le
 
 
 ; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
@@ -598,9 +598,9 @@
   ; w0 below is a reasonable guess but could change: it certainly comes into the
   ;  function there.
 
-; CHECK-ARM64-NEXT: sxtb w[[OLD_EXT:[0-9]+]], w[[OLD]]
-; CHECK-ARM64-NEXT: cmp w[[OLD_EXT]], w0, sxtb
-; CHECK-ARM64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt
+; CHECK-NEXT: sxtb w[[OLD_EXT:[0-9]+]], w[[OLD]]
+; CHECK-NEXT: cmp w[[OLD_EXT]], w0, sxtb
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt
 
 
 ; CHECK-NEXT: stlxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
@@ -623,9 +623,9 @@
   ; w0 below is a reasonable guess but could change: it certainly comes into the
   ;  function there.
 
-; CHECK-ARM64-NEXT: sxth w[[OLD_EXT:[0-9]+]], w[[OLD]]
-; CHECK-ARM64-NEXT: cmp w[[OLD_EXT]], w0, sxth
-; CHECK-ARM64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt
+; CHECK-NEXT: sxth w[[OLD_EXT:[0-9]+]], w[[OLD]]
+; CHECK-NEXT: cmp w[[OLD_EXT]], w0, sxth
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt
 
 
 ; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
@@ -648,8 +648,8 @@
   ; w0 below is a reasonable guess but could change: it certainly comes into the
   ;  function there.
 
-; CHECK-ARM64-NEXT: cmp w[[OLD]], w0
-; CHECK-ARM64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt
+; CHECK-NEXT: cmp w[[OLD]], w0
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, gt
 
 
 ; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
@@ -672,8 +672,8 @@
   ; x0 below is a reasonable guess but could change: it certainly comes into the
   ; function there.
 
-; CHECK-ARM64-NEXT: cmp x[[OLD]], x0
-; CHECK-ARM64-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, gt
+; CHECK-NEXT: cmp x[[OLD]], x0
+; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, gt
 
 
 ; CHECK-NEXT: stxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
@@ -696,8 +696,8 @@
   ; w0 below is a reasonable guess but could change: it certainly comes into the
   ;  function there.
 
-; CHECK-ARM64-NEXT: cmp w[[OLD]], w0, uxtb
-; CHECK-ARM64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, ls
+; CHECK-NEXT: cmp w[[OLD]], w0, uxtb
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, ls
 
 
 ; CHECK-NEXT: stxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
@@ -720,8 +720,8 @@
   ; w0 below is a reasonable guess but could change: it certainly comes into the
   ;  function there.
 
-; CHECK-ARM64-NEXT: cmp w[[OLD]], w0, uxth
-; CHECK-ARM64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, ls
+; CHECK-NEXT: cmp w[[OLD]], w0, uxth
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, ls
 
 
 ; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
@@ -744,8 +744,8 @@
   ; w0 below is a reasonable guess but could change: it certainly comes into the
   ;  function there.
 
-; CHECK-ARM64-NEXT: cmp w[[OLD]], w0
-; CHECK-ARM64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, ls
+; CHECK-NEXT: cmp w[[OLD]], w0
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, ls
 
 
 ; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
@@ -768,8 +768,8 @@
   ; x0 below is a reasonable guess but could change: it certainly comes into the
   ; function there.
 
-; CHECK-ARM64-NEXT: cmp x[[OLD]], x0
-; CHECK-ARM64-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, ls
+; CHECK-NEXT: cmp x[[OLD]], x0
+; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, ls
 
 
 ; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
@@ -792,8 +792,8 @@
   ; w0 below is a reasonable guess but could change: it certainly comes into the
   ;  function there.
 
-; CHECK-ARM64-NEXT: cmp w[[OLD]], w0, uxtb
-; CHECK-ARM64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi
+; CHECK-NEXT: cmp w[[OLD]], w0, uxtb
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi
 
 
 ; CHECK-NEXT: stlxrb [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
@@ -816,8 +816,8 @@
   ; w0 below is a reasonable guess but could change: it certainly comes into the
   ;  function there.
 
-; CHECK-ARM64-NEXT: cmp w[[OLD]], w0, uxth
-; CHECK-ARM64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi
+; CHECK-NEXT: cmp w[[OLD]], w0, uxth
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi
 
 
 ; CHECK-NEXT: stxrh [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
@@ -840,8 +840,8 @@
   ; w0 below is a reasonable guess but could change: it certainly comes into the
   ;  function there.
 
-; CHECK-ARM64-NEXT: cmp w[[OLD]], w0
-; CHECK-ARM64-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi
+; CHECK-NEXT: cmp w[[OLD]], w0
+; CHECK-NEXT: csel [[NEW:w[0-9]+]], w[[OLD]], w0, hi
 
 
 ; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
@@ -864,8 +864,8 @@
   ; x0 below is a reasonable guess but could change: it certainly comes into the
   ; function there.
 
-; CHECK-ARM64-NEXT: cmp x[[OLD]], x0
-; CHECK-ARM64-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, hi
+; CHECK-NEXT: cmp x[[OLD]], x0
+; CHECK-NEXT: csel [[NEW:x[0-9]+]], x[[OLD]], x0, hi
 
 
 ; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x[[ADDR]]]
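
For context on the CHECK-REG lines in this test: the stxr/stlxr status
register must be distinct from the value and address registers, otherwise the
store-exclusive is CONSTRAINED UNPREDICTABLE. A minimal sketch of IR that
lowers to such a loop (function name hypothetical, not part of this test):

    define i32 @atomic_add(i32* %p, i32 %v) nounwind {
      ; Lowers to a ldaxr/stlxr retry loop on AArch64; the status
      ; register captured by [[STATUS]] above must be a fresh register,
      ; never the loaded value or the address register.
      %old = atomicrmw add i32* %p, i32 %v seq_cst
      ret i32 %old
    }
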
diff --git a/llvm/test/CodeGen/AArch64/basic-pic.ll b/llvm/test/CodeGen/AArch64/basic-pic.ll
index 2c69bee..62d41bc 100644
--- a/llvm/test/CodeGen/AArch64/basic-pic.ll
+++ b/llvm/test/CodeGen/AArch64/basic-pic.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-linux-gnu -verify-machineinstrs -relocation-model=pic %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -relocation-model=pic %s -o - | FileCheck %s
 
 @var = global i32 0
 
diff --git a/llvm/test/CodeGen/AArch64/bitfield-insert-0.ll b/llvm/test/CodeGen/AArch64/bitfield-insert-0.ll
index 8959e1b..da0ed8a 100644
--- a/llvm/test/CodeGen/AArch64/bitfield-insert-0.ll
+++ b/llvm/test/CodeGen/AArch64/bitfield-insert-0.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-linux-gnu -filetype=obj -o - %s | llvm-objdump -disassemble - | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -filetype=obj -o - %s | llvm-objdump -disassemble - | FileCheck %s
 
 ; The encoding of lsb -> immr in the CGed bitfield instructions was wrong at one
 ; point, in the edge case where lsb = 0. Just make sure.
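
The lsb = 0 edge case referred to above can be exercised with a plain
mask-and-merge pattern; a minimal sketch (function name hypothetical),
assuming the usual alias encoding in which an insert at lsb = 0 must encode
immr = 0:

    define i32 @insert_at_lsb0(i32 %old, i32 %new) {
      ; Clear bits [15:0] of %old, keep bits [15:0] of %new, and merge.
      ; This should select to a single bitfield insert such as
      ; "bfxil w0, w1, #0, #16", whose immr field encodes lsb = 0.
      %old.hi = and i32 %old, -65536   ; 0xFFFF0000
      %new.lo = and i32 %new, 65535    ; 0x0000FFFF
      %ins = or i32 %old.hi, %new.lo
      ret i32 %ins
    }
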
diff --git a/llvm/test/CodeGen/AArch64/bitfield-insert.ll b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
index 8b0b4da..2369a55 100644
--- a/llvm/test/CodeGen/AArch64/bitfield-insert.ll
+++ b/llvm/test/CodeGen/AArch64/bitfield-insert.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-none-linux-gnu < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s --check-prefix=CHECK
 
 ; First, a simple example from Clang. The registers could plausibly be
 ; different, but probably won't be.
@@ -64,7 +64,7 @@
 ; CHECK-LABEL: test_whole32_from64:
 
 
-; CHECK-ARM64: bfxil {{x[0-9]+}}, {{x[0-9]+}}, #0, #16
+; CHECK: bfxil {{x[0-9]+}}, {{x[0-9]+}}, #0, #16
 
 ; CHECK: ret
 
@@ -83,7 +83,7 @@
 define void @test_32bit_masked(i32 *%existing, i32 *%new) {
 ; CHECK-LABEL: test_32bit_masked:
 
-; CHECK-ARM64: and
+; CHECK: and
 ; CHECK: bfi [[INSERT:w[0-9]+]], {{w[0-9]+}}, #3, #4
 
   %oldval = load volatile i32* %existing
@@ -101,7 +101,7 @@
 
 define void @test_64bit_masked(i64 *%existing, i64 *%new) {
 ; CHECK-LABEL: test_64bit_masked:
-; CHECK-ARM64: and
+; CHECK: and
 ; CHECK: bfi [[INSERT:x[0-9]+]], {{x[0-9]+}}, #40, #8
 
   %oldval = load volatile i64* %existing
@@ -121,7 +121,7 @@
 define void @test_32bit_complexmask(i32 *%existing, i32 *%new) {
 ; CHECK-LABEL: test_32bit_complexmask:
 
-; CHECK-ARM64: and
+; CHECK: and
 ; CHECK: bfi {{w[0-9]+}}, {{w[0-9]+}}, #3, #4
 
   %oldval = load volatile i32* %existing
diff --git a/llvm/test/CodeGen/AArch64/bitfield.ll b/llvm/test/CodeGen/AArch64/bitfield.ll
index 71ffe30..0e12653 100644
--- a/llvm/test/CodeGen/AArch64/bitfield.ll
+++ b/llvm/test/CodeGen/AArch64/bitfield.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefix=CHECK
 
 @var32 = global i32 0
 @var64 = global i64 0
@@ -23,7 +23,7 @@
 
   %uxt64 = zext i8 %var to i64
   store volatile i64 %uxt64, i64* @var64
-; CHECK-ARM64: and {{x[0-9]+}}, {{x[0-9]+}}, #0xff
+; CHECK: and {{x[0-9]+}}, {{x[0-9]+}}, #0xff
   ret void
 }
 
@@ -47,7 +47,7 @@
 
   %uxt64 = zext i16 %var to i64
   store volatile i64 %uxt64, i64* @var64
-; CHECK-ARM64: and {{x[0-9]+}}, {{x[0-9]+}}, #0xffff
+; CHECK: and {{x[0-9]+}}, {{x[0-9]+}}, #0xffff
   ret void
 }
 
@@ -60,7 +60,7 @@
 
   %uxt64 = zext i32 %var to i64
   store volatile i64 %uxt64, i64* @var64
-; CHECK-ARM64: ubfx {{x[0-9]+}}, {{x[0-9]+}}, #0, #32
+; CHECK: ubfx {{x[0-9]+}}, {{x[0-9]+}}, #0, #32
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AArch64/blockaddress.ll b/llvm/test/CodeGen/AArch64/blockaddress.ll
index 0cbdd39..1eec4cc 100644
--- a/llvm/test/CodeGen/AArch64/blockaddress.ll
+++ b/llvm/test/CodeGen/AArch64/blockaddress.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=arm64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -code-model=large -mtriple=arm64-none-linux-gnu -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-LARGE %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -code-model=large -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-LARGE %s
 
 @addr = global i8* null
 
diff --git a/llvm/test/CodeGen/AArch64/bool-loads.ll b/llvm/test/CodeGen/AArch64/bool-loads.ll
index 5d92ef6..881aeaa 100644
--- a/llvm/test/CodeGen/AArch64/bool-loads.ll
+++ b/llvm/test/CodeGen/AArch64/bool-loads.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
 
 @var = global i1 0
 
diff --git a/llvm/test/CodeGen/AArch64/breg.ll b/llvm/test/CodeGen/AArch64/breg.ll
index 137173b..591f483 100644
--- a/llvm/test/CodeGen/AArch64/breg.ll
+++ b/llvm/test/CodeGen/AArch64/breg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
 
 @stored_label = global i8* null
 
diff --git a/llvm/test/CodeGen/AArch64/callee-save.ll b/llvm/test/CodeGen/AArch64/callee-save.ll
index 9b04a8f9..046e6ce 100644
--- a/llvm/test/CodeGen/AArch64/callee-save.ll
+++ b/llvm/test/CodeGen/AArch64/callee-save.ll
@@ -1,19 +1,14 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s --check-prefix=CHECK-ARM64
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s
 
 @var = global float 0.0
 
 define void @foo() {
 ; CHECK-LABEL: foo:
 
-; CHECK: stp d14, d15, [sp
-; CHECK: stp d12, d13, [sp
-; CHECK: stp d10, d11, [sp
-; CHECK: stp d8, d9, [sp
-
-; CHECK-ARM64: stp d15, d14, [sp
-; CHECK-ARM64: stp d13, d12, [sp
-; CHECK-ARM64: stp d11, d10, [sp
-; CHECK-ARM64: stp d9, d8, [sp
+; CHECK: stp d15, d14, [sp
+; CHECK: stp d13, d12, [sp
+; CHECK: stp d11, d10, [sp
+; CHECK: stp d9, d8, [sp
 
   ; Create lots of live variables to exhaust the supply of
   ; caller-saved registers
@@ -83,14 +78,9 @@
   store volatile float %val31, float* @var
   store volatile float %val32, float* @var
 
-; CHECK: ldp     d8, d9, [sp
-; CHECK: ldp     d10, d11, [sp
-; CHECK: ldp     d12, d13, [sp
-; CHECK: ldp     d14, d15, [sp
-
-; CHECK-ARM64: ldp     d9, d8, [sp
-; CHECK-ARM64: ldp     d11, d10, [sp
-; CHECK-ARM64: ldp     d13, d12, [sp
-; CHECK-ARM64: ldp     d15, d14, [sp
+; CHECK: ldp     d9, d8, [sp
+; CHECK: ldp     d11, d10, [sp
+; CHECK: ldp     d13, d12, [sp
+; CHECK: ldp     d15, d14, [sp
   ret void
 }
diff --git a/llvm/test/CodeGen/AArch64/code-model-large-abs.ll b/llvm/test/CodeGen/AArch64/code-model-large-abs.ll
index 0408e6f..ca92500 100644
--- a/llvm/test/CodeGen/AArch64/code-model-large-abs.ll
+++ b/llvm/test/CodeGen/AArch64/code-model-large-abs.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-linux-gnu -code-model=large -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -code-model=large -o - %s | FileCheck %s
 
 @var8 = global i8 0
 @var16 = global i16 0
diff --git a/llvm/test/CodeGen/AArch64/compare-branch.ll b/llvm/test/CodeGen/AArch64/compare-branch.ll
index accbadd..a1a87cf 100644
--- a/llvm/test/CodeGen/AArch64/compare-branch.ll
+++ b/llvm/test/CodeGen/AArch64/compare-branch.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
 
 @var32 = global i32 0
 @var64 = global i64 0
diff --git a/llvm/test/CodeGen/AArch64/complex-copy-noneon.ll b/llvm/test/CodeGen/AArch64/complex-copy-noneon.ll
index f65b116..4ae5478 100644
--- a/llvm/test/CodeGen/AArch64/complex-copy-noneon.ll
+++ b/llvm/test/CodeGen/AArch64/complex-copy-noneon.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=-neon < %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-neon < %s
 
 ; The DAG combiner decided to use a vector load/store for this struct copy
 ; previously. This probably shouldn't happen without NEON, but the most
diff --git a/llvm/test/CodeGen/AArch64/cond-sel.ll b/llvm/test/CodeGen/AArch64/cond-sel.ll
index 96e11b1..5f81cba 100644
--- a/llvm/test/CodeGen/AArch64/cond-sel.ll
+++ b/llvm/test/CodeGen/AArch64/cond-sel.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -mcpu=cyclone | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mcpu=cyclone | FileCheck %s --check-prefix=CHECK
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
 
 @var32 = global i32 0
 @var64 = global i64 0
@@ -45,7 +45,7 @@
 ; CHECK-NOFP-NOT: fcmp
   %val2 = select i1 %tst2, i64 9, i64 15
   store i64 %val2, i64* @var64
-; CHECK-ARM64: orr w[[CONST15:[0-9]+]], wzr, #0xf
+; CHECK: orr w[[CONST15:[0-9]+]], wzr, #0xf
 ; CHECK: movz {{[wx]}}[[CONST9:[0-9]+]], #{{9|0x9}}
 ; CHECK: csel [[MAYBETRUE:x[0-9]+]], x[[CONST9]], x[[CONST15]], eq
 ; CHECK: csel {{x[0-9]+}}, x[[CONST9]], [[MAYBETRUE]], vs
diff --git a/llvm/test/CodeGen/AArch64/directcond.ll b/llvm/test/CodeGen/AArch64/directcond.ll
index 832a010..1b51928 100644
--- a/llvm/test/CodeGen/AArch64/directcond.ll
+++ b/llvm/test/CodeGen/AArch64/directcond.ll
@@ -1,10 +1,10 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s --check-prefix=CHECK
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
 
 define i32 @test_select_i32(i1 %bit, i32 %a, i32 %b) {
 ; CHECK-LABEL: test_select_i32:
   %val = select i1 %bit, i32 %a, i32 %b
-; CHECK-ARM64: tst w0, #0x1
+; CHECK: tst w0, #0x1
 ; CHECK-NEXT: csel w0, w1, w2, ne
 
   ret i32 %val
@@ -13,7 +13,7 @@
 define i64 @test_select_i64(i1 %bit, i64 %a, i64 %b) {
 ; CHECK-LABEL: test_select_i64:
   %val = select i1 %bit, i64 %a, i64 %b
-; CHECK-ARM64: tst w0, #0x1
+; CHECK: tst w0, #0x1
 ; CHECK-NEXT: csel x0, x1, x2, ne
 
   ret i64 %val
@@ -22,7 +22,7 @@
 define float @test_select_float(i1 %bit, float %a, float %b) {
 ; CHECK-LABEL: test_select_float:
   %val = select i1 %bit, float %a, float %b
-; CHECK-ARM64: tst w0, #0x1
+; CHECK: tst w0, #0x1
 ; CHECK-NEXT: fcsel s0, s0, s1, ne
 ; CHECK-NOFP-NOT: fcsel
   ret float %val
@@ -31,7 +31,7 @@
 define double @test_select_double(i1 %bit, double %a, double %b) {
 ; CHECK-LABEL: test_select_double:
   %val = select i1 %bit, double %a, double %b
-; CHECK-ARM64: tst w0, #0x1
+; CHECK: tst w0, #0x1
 ; CHECK-NEXT: fcsel d0, d0, d1, ne
 ; CHECK-NOFP-NOT: fcsel
 
diff --git a/llvm/test/CodeGen/AArch64/dp1.ll b/llvm/test/CodeGen/AArch64/dp1.ll
index b09ce36..662b415 100644
--- a/llvm/test/CodeGen/AArch64/dp1.ll
+++ b/llvm/test/CodeGen/AArch64/dp1.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
 
 @var32 = global i32 0
 @var64 = global i64 0
diff --git a/llvm/test/CodeGen/AArch64/eliminate-trunc.ll b/llvm/test/CodeGen/AArch64/eliminate-trunc.ll
index 02a085a..ea86a08 100644
--- a/llvm/test/CodeGen/AArch64/eliminate-trunc.ll
+++ b/llvm/test/CodeGen/AArch64/eliminate-trunc.ll
@@ -1,11 +1,11 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-none-apple-ios7.0 -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-ARM64
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-apple-ios7.0 -mcpu=cyclone | FileCheck %s
 
 ; Check that the trunc i64 operation is translated as a subregister access,
 ; eliminating an i32 induction variable.
 
-; CHECK-ARM64-NOT: add {{x[0-9]+}}, {{x[0-9]+}}, #1
-; CHECK-ARM64: add {{w[0-9]+}}, {{w[0-9]+}}, #1
-; CHECK-ARM64-NEXT: cmp {{w[0-9]+}}, {{w[0-9]+}}
+; CHECK-NOT: add {{x[0-9]+}}, {{x[0-9]+}}, #1
+; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #1
+; CHECK-NEXT: cmp {{w[0-9]+}}, {{w[0-9]+}}
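 ;
 ; Sketch of the idea (annotation, not a CHECK): since w<N> aliases the low
 ; 32 bits of x<N>, IR along the lines of
 ;   %iv.next = add i64 %iv, 1
 ;   %iv.32 = trunc i64 %iv.next to i32
 ; needs no separate truncation instruction; keeping the induction in
 ; w-registers yields the single "add w, w, #1" checked above.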
 define void @test1_signed([8 x i8]* nocapture %a, i8* nocapture readonly %box, i8 %limit) minsize {
 entry:
   %conv = zext i8 %limit to i32
diff --git a/llvm/test/CodeGen/AArch64/extern-weak.ll b/llvm/test/CodeGen/AArch64/extern-weak.ll
index 8f41845..ce5c0f6 100644
--- a/llvm/test/CodeGen/AArch64/extern-weak.ll
+++ b/llvm/test/CodeGen/AArch64/extern-weak.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=arm64-none-linux-gnu -o - %s | FileCheck %s --check-prefix=CHECK-ARM64
-; RUN: llc -mtriple=arm64-none-linux-gnu -code-model=large -o - %s | FileCheck --check-prefix=CHECK-LARGE %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -code-model=large -o - %s | FileCheck --check-prefix=CHECK-LARGE %s
 
 declare extern_weak i32 @var()
 
@@ -9,8 +9,8 @@
   ret i32()* @var
 
 
-; CHECK-ARM64: adrp x[[ADDRHI:[0-9]+]], :got:var
-; CHECK-ARM64: ldr x0, [x[[ADDRHI]], :got_lo12:var]
+; CHECK: adrp x[[ADDRHI:[0-9]+]], :got:var
+; CHECK: ldr x0, [x[[ADDRHI]], :got_lo12:var]
 
   ; In the large model, the usual relocations are absolute and can
   ; materialise 0.
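   ; Annotated sketch of why the GOT load matters here: for an extern_weak
   ; symbol the GOT entry resolves to 0 when the symbol is absent, e.g.
   ;   adrp x0, :got:var
   ;   ldr  x0, [x0, :got_lo12:var]   ; x0 == 0 if @var is undefined
   ; whereas a direct adrp/add pair could never materialise a null pointer.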
@@ -27,9 +27,9 @@
   %addr = getelementptr [10 x i32]* @arr_var, i32 0, i32 5
 
 
-; CHECK-ARM64: adrp x[[ADDRHI:[0-9]+]], :got:arr_var
-; CHECK-ARM64: ldr [[BASE:x[0-9]+]], [x[[ADDRHI]], :got_lo12:arr_var]
-; CHECK-ARM64: add x0, [[BASE]], #20
+; CHECK: adrp x[[ADDRHI:[0-9]+]], :got:arr_var
+; CHECK: ldr [[BASE:x[0-9]+]], [x[[ADDRHI]], :got_lo12:arr_var]
+; CHECK: add x0, [[BASE]], #20
 
   ret i32* %addr
 
@@ -46,8 +46,8 @@
 define i32* @wibble() {
   ret i32* @defined_weak_var
 
-; CHECK-ARM64: adrp [[BASE:x[0-9]+]], defined_weak_var
-; CHECK-ARM64: add x0, [[BASE]], :lo12:defined_weak_var
+; CHECK: adrp [[BASE:x[0-9]+]], defined_weak_var
+; CHECK: add x0, [[BASE]], :lo12:defined_weak_var
 
 ; CHECK-LARGE: movz x0, #:abs_g3:defined_weak_var
 ; CHECK-LARGE: movk x0, #:abs_g2_nc:defined_weak_var
diff --git a/llvm/test/CodeGen/AArch64/fastcc-reserved.ll b/llvm/test/CodeGen/AArch64/fastcc-reserved.ll
index 09a6ae3..a392619 100644
--- a/llvm/test/CodeGen/AArch64/fastcc-reserved.ll
+++ b/llvm/test/CodeGen/AArch64/fastcc-reserved.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -tailcallopt | FileCheck %s --check-prefix=CHECK-ARM64
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s
 
 ; This test is designed to be run in the situation where the
 ; call-frame is not reserved (hence disable-fp-elim), but where
@@ -12,30 +12,22 @@
   %addr = alloca i8, i32 %in
 
 ; Normal frame setup stuff:
-; CHECK: sub sp, sp,
-; CHECK: stp x29, x30
-; CHECK-ARM64: stp     x29, x30, [sp, #-16]!
-; CHECK-ARM64: mov     x29, sp
+; CHECK: stp     x29, x30, [sp, #-16]!
+; CHECK: mov     x29, sp
 
 ; Reserve space for call-frame:
 ; CHECK: sub sp, sp, #16
-; CHECK-ARM64: sub sp, sp, #16
 
   call fastcc void @will_pop([8 x i32] undef, i32 42)
 ; CHECK: bl will_pop
-; CHECK-ARM64: bl will_pop
 
 ; Since @will_pop is fastcc with tailcallopt, it will put the stack
 ; back where it needs to be, so we shouldn't duplicate that
 ; CHECK-NOT: sub sp, sp, #16
 ; CHECK-NOT: add sp, sp,
-; CHECK-ARM64-NOT: sub sp, sp, #16
-; CHECK-ARM64-NOT: add sp, sp,
 
-; CHECK: ldp x29, x30
-; CHECK: add sp, sp,
-; CHECK-ARM64: mov     sp, x29
-; CHECK-ARM64: ldp     x29, x30, [sp], #16
+; CHECK: mov     sp, x29
+; CHECK: ldp     x29, x30, [sp], #16
   ret void
 }
 
@@ -46,28 +38,21 @@
 
   %addr = alloca i8, i32 %in
 ; Normal frame setup again
-; CHECK: sub sp, sp,
-; CHECK: stp x29, x30
-; CHECK-ARM64: stp     x29, x30, [sp, #-16]!
-; CHECK-ARM64: mov     x29, sp
+; CHECK: stp     x29, x30, [sp, #-16]!
+; CHECK: mov     x29, sp
 
 ; Reserve space for call-frame
 ; CHECK: sub sp, sp, #16
-; CHECK-ARM64: sub sp, sp, #16
 
   call void @wont_pop([8 x i32] undef, i32 42)
 ; CHECK: bl wont_pop
-; CHECK-ARM64: bl wont_pop
 
 ; This time we *do* need to unreserve the call-frame
 ; CHECK: add sp, sp, #16
-; CHECK-ARM64: add sp, sp, #16
 
 ; Check for the epilogue (primarily to make sure the sp adjustment spotted
 ; above wasn't part of it).
-; CHECK: ldp x29, x30
-; CHECK: add sp, sp,
-; CHECK-ARM64: mov     sp, x29
-; CHECK-ARM64: ldp     x29, x30, [sp], #16
+; CHECK: mov     sp, x29
+; CHECK: ldp     x29, x30, [sp], #16
   ret void
 }
diff --git a/llvm/test/CodeGen/AArch64/fastcc.ll b/llvm/test/CodeGen/AArch64/fastcc.ll
index b641de0..9917fcd 100644
--- a/llvm/test/CodeGen/AArch64/fastcc.ll
+++ b/llvm/test/CodeGen/AArch64/fastcc.ll
@@ -1,226 +1,144 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -tailcallopt | FileCheck %s -check-prefix CHECK-ARM64-TAIL
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu | FileCheck --check-prefix=CHECK-ARM64 %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s -check-prefix CHECK-TAIL
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
 
 ; Without tailcallopt fastcc still means the caller cleans up the
 ; stack, so try to make sure this is respected.
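 ;
 ; Illustrative sketch (our reading of the convention, not a CHECK): with
 ; -tailcallopt, a fastcc callee pops its own stack arguments, so after
 ;   call fastcc void @func_stack8([8 x i32] undef, i32 42)
 ; the caller must re-reserve its 16-byte call frame ("sub sp, sp, #16");
 ; without -tailcallopt the callee leaves sp alone and no re-adjustment
 ; is needed.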
 
 define fastcc void @func_stack0() {
 ; CHECK-LABEL: func_stack0:
-; CHECK: sub sp, sp, #48
-
-; CHECK-ARM64-LABEL: func_stack0:
-; CHECK-ARM64: stp x29, x30, [sp, #-16]!
-; CHECK-ARM64-NEXT: mov x29, sp
-; CHECK-ARM64-NEXT: sub sp, sp, #32
+; CHECK: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #32
 
 ; CHECK-TAIL-LABEL: func_stack0:
-; CHECK-TAIL: sub sp, sp, #48
-
-; CHECK-ARM64-TAIL-LABEL: func_stack0:
-; CHECK-ARM64-TAIL: stp x29, x30, [sp, #-16]!
-; CHECK-ARM64-TAIL-NEXT: mov x29, sp
-; CHECK-ARM64-TAIL-NEXT: sub sp, sp, #32
+; CHECK-TAIL: stp x29, x30, [sp, #-16]!
+; CHECK-TAIL-NEXT: mov x29, sp
+; CHECK-TAIL-NEXT: sub sp, sp, #32
 
 
   call fastcc void @func_stack8([8 x i32] undef, i32 42)
 ; CHECK:  bl func_stack8
 ; CHECK-NOT: sub sp, sp,
 
-; CHECK-ARM64:  bl func_stack8
-; CHECK-ARM64-NOT: sub sp, sp,
-
 ; CHECK-TAIL: bl func_stack8
 ; CHECK-TAIL: sub sp, sp, #16
 
-; CHECK-ARM64-TAIL: bl func_stack8
-; CHECK-ARM64-TAIL: sub sp, sp, #16
-
 
   call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9)
 ; CHECK: bl func_stack32
 ; CHECK-NOT: sub sp, sp,
 
-; CHECK-ARM64: bl func_stack32
-; CHECK-ARM64-NOT: sub sp, sp,
 
 ; CHECK-TAIL: bl func_stack32
 ; CHECK-TAIL: sub sp, sp, #32
 
-; CHECK-ARM64-TAIL: bl func_stack32
-; CHECK-ARM64-TAIL: sub sp, sp, #32
-
 
   call fastcc void @func_stack0()
 ; CHECK: bl func_stack0
 ; CHECK-NOT: sub sp, sp
 
-; CHECK-ARM64: bl func_stack0
-; CHECK-ARM64-NOT: sub sp, sp
 
 ; CHECK-TAIL: bl func_stack0
 ; CHECK-TAIL-NOT: sub sp, sp
 
-; CHECK-ARM64-TAIL: bl func_stack0
-; CHECK-ARM64-TAIL-NOT: sub sp, sp
-
   ret void
-; CHECK: add sp, sp, #48
+; CHECK: mov sp, x29
+; CHECK-NEXT: ldp     x29, x30, [sp], #16
 ; CHECK-NEXT: ret
 
-; CHECK-ARM64: mov sp, x29
-; CHECK-ARM64-NEXT: ldp     x29, x30, [sp], #16
-; CHECK-ARM64-NEXT: ret
 
-; CHECK-TAIL: add sp, sp, #48
+; CHECK-TAIL: mov sp, x29
+; CHECK-TAIL-NEXT: ldp     x29, x30, [sp], #16
 ; CHECK-TAIL-NEXT: ret
-
-; CHECK-ARM64-TAIL: mov sp, x29
-; CHECK-ARM64-TAIL-NEXT: ldp     x29, x30, [sp], #16
-; CHECK-ARM64-TAIL-NEXT: ret
 }
 
 define fastcc void @func_stack8([8 x i32], i32 %stacked) {
 ; CHECK-LABEL: func_stack8:
-; CHECK: sub sp, sp, #48
+; CHECK: stp x29, x30, [sp, #-16]!
+; CHECK: mov x29, sp
+; CHECK: sub sp, sp, #32
 
-; CHECK-ARM64-LABEL: func_stack8:
-; CHECK-ARM64: stp x29, x30, [sp, #-16]!
-; CHECK-ARM64: mov x29, sp
-; CHECK-ARM64: sub sp, sp, #32
 
 ; CHECK-TAIL-LABEL: func_stack8:
-; CHECK-TAIL: sub sp, sp, #48
-
-; CHECK-ARM64-TAIL-LABEL: func_stack8:
-; CHECK-ARM64-TAIL: stp x29, x30, [sp, #-16]!
-; CHECK-ARM64-TAIL: mov x29, sp
-; CHECK-ARM64-TAIL: sub sp, sp, #32
+; CHECK-TAIL: stp x29, x30, [sp, #-16]!
+; CHECK-TAIL: mov x29, sp
+; CHECK-TAIL: sub sp, sp, #32
 
 
   call fastcc void @func_stack8([8 x i32] undef, i32 42)
 ; CHECK:  bl func_stack8
 ; CHECK-NOT: sub sp, sp,
 
-; CHECK-ARM64:  bl func_stack8
-; CHECK-ARM64-NOT: sub sp, sp,
 
 ; CHECK-TAIL: bl func_stack8
 ; CHECK-TAIL: sub sp, sp, #16
 
-; CHECK-ARM64-TAIL: bl func_stack8
-; CHECK-ARM64-TAIL: sub sp, sp, #16
-
 
   call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9)
 ; CHECK: bl func_stack32
 ; CHECK-NOT: sub sp, sp,
 
-; CHECK-ARM64: bl func_stack32
-; CHECK-ARM64-NOT: sub sp, sp,
 
 ; CHECK-TAIL: bl func_stack32
 ; CHECK-TAIL: sub sp, sp, #32
 
-; CHECK-ARM64-TAIL: bl func_stack32
-; CHECK-ARM64-TAIL: sub sp, sp, #32
-
 
   call fastcc void @func_stack0()
 ; CHECK: bl func_stack0
 ; CHECK-NOT: sub sp, sp
 
-; CHECK-ARM64: bl func_stack0
-; CHECK-ARM64-NOT: sub sp, sp
-
 ; CHECK-TAIL: bl func_stack0
 ; CHECK-TAIL-NOT: sub sp, sp
 
-; CHECK-ARM64-TAIL: bl func_stack0
-; CHECK-ARM64-TAIL-NOT: sub sp, sp
-
   ret void
-; CHECK: add sp, sp, #48
+; CHECK: mov sp, x29
+; CHECK-NEXT: ldp     x29, x30, [sp], #16
 ; CHECK-NEXT: ret
 
-; CHECK-ARM64: mov sp, x29
-; CHECK-ARM64-NEXT: ldp     x29, x30, [sp], #16
-; CHECK-ARM64-NEXT: ret
 
-; CHECK-TAIL: add sp, sp, #64
+; CHECK-TAIL: mov sp, x29
+; CHECK-TAIL-NEXT: ldp     x29, x30, [sp], #16
 ; CHECK-TAIL-NEXT: ret
-
-; CHECK-ARM64-TAIL: mov sp, x29
-; CHECK-ARM64-TAIL-NEXT: ldp     x29, x30, [sp], #16
-; CHECK-ARM64-TAIL-NEXT: ret
 }
 
 define fastcc void @func_stack32([8 x i32], i128 %stacked0, i128 %stacked1) {
 ; CHECK-LABEL: func_stack32:
-; CHECK: sub sp, sp, #48
-
-; CHECK-ARM64-LABEL: func_stack32:
-; CHECK-ARM64: mov x29, sp
+; CHECK: mov x29, sp
 
 ; CHECK-TAIL-LABEL: func_stack32:
-; CHECK-TAIL: sub sp, sp, #48
-
-; CHECK-ARM64-TAIL-LABEL: func_stack32:
-; CHECK-ARM64-TAIL: mov x29, sp
+; CHECK-TAIL: mov x29, sp
 
 
   call fastcc void @func_stack8([8 x i32] undef, i32 42)
 ; CHECK:  bl func_stack8
 ; CHECK-NOT: sub sp, sp,
 
-; CHECK-ARM64:  bl func_stack8
-; CHECK-ARM64-NOT: sub sp, sp,
-
 ; CHECK-TAIL: bl func_stack8
 ; CHECK-TAIL: sub sp, sp, #16
 
-; CHECK-ARM64-TAIL: bl func_stack8
-; CHECK-ARM64-TAIL: sub sp, sp, #16
-
 
   call fastcc void @func_stack32([8 x i32] undef, i128 0, i128 9)
 ; CHECK: bl func_stack32
 ; CHECK-NOT: sub sp, sp,
 
-; CHECK-ARM64: bl func_stack32
-; CHECK-ARM64-NOT: sub sp, sp,
 
 ; CHECK-TAIL: bl func_stack32
 ; CHECK-TAIL: sub sp, sp, #32
 
-; CHECK-ARM64-TAIL: bl func_stack32
-; CHECK-ARM64-TAIL: sub sp, sp, #32
-
 
   call fastcc void @func_stack0()
 ; CHECK: bl func_stack0
 ; CHECK-NOT: sub sp, sp
 
-; CHECK-ARM64: bl func_stack0
-; CHECK-ARM64-NOT: sub sp, sp
 
 ; CHECK-TAIL: bl func_stack0
 ; CHECK-TAIL-NOT: sub sp, sp
 
-; CHECK-ARM64-TAIL: bl func_stack0
-; CHECK-ARM64-TAIL-NOT: sub sp, sp
-
   ret void
-; CHECK: add sp, sp, #48
+; CHECK: mov sp, x29
+; CHECK-NEXT: ldp     x29, x30, [sp], #16
 ; CHECK-NEXT: ret
 
-; CHECK-ARM64: mov sp, x29
-; CHECK-ARM64-NEXT: ldp     x29, x30, [sp], #16
-; CHECK-ARM64-NEXT: ret
-
-; CHECK-TAIL: add sp, sp, #80
+; CHECK-TAIL: mov sp, x29
+; CHECK-TAIL-NEXT: ldp     x29, x30, [sp], #16
 ; CHECK-TAIL-NEXT: ret
-
-; CHECK-ARM64-TAIL: mov sp, x29
-; CHECK-ARM64-TAIL-NEXT: ldp     x29, x30, [sp], #16
-; CHECK-ARM64-TAIL-NEXT: ret
 }
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index c54e3e6..3c74508 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
 
 declare void @bar(i32)
 
diff --git a/llvm/test/CodeGen/AArch64/fcvt-fixed.ll b/llvm/test/CodeGen/AArch64/fcvt-fixed.ll
index 40800d0..ccb3616 100644
--- a/llvm/test/CodeGen/AArch64/fcvt-fixed.ll
+++ b/llvm/test/CodeGen/AArch64/fcvt-fixed.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
 ; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-apple-ios7.0 -O0
 
 ; (The O0 test is to make sure FastISel still constrains its operands properly
diff --git a/llvm/test/CodeGen/AArch64/flags-multiuse.ll b/llvm/test/CodeGen/AArch64/flags-multiuse.ll
index 667c05d..c9b0b9f 100644
--- a/llvm/test/CodeGen/AArch64/flags-multiuse.ll
+++ b/llvm/test/CodeGen/AArch64/flags-multiuse.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-none-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
 
 ; LLVM should be able to cope with multiple uses of the same flag-setting
 ; instruction at different points of a routine. Either by rematerializing the
diff --git a/llvm/test/CodeGen/AArch64/floatdp_2source.ll b/llvm/test/CodeGen/AArch64/floatdp_2source.ll
index 8e98b78..2622717 100644
--- a/llvm/test/CodeGen/AArch64/floatdp_2source.ll
+++ b/llvm/test/CodeGen/AArch64/floatdp_2source.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-linux-gnu -mcpu=cyclone | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu -mcpu=cyclone | FileCheck %s
 
 @varfloat = global float 0.0
 @vardouble = global double 0.0
diff --git a/llvm/test/CodeGen/AArch64/fp-cond-sel.ll b/llvm/test/CodeGen/AArch64/fp-cond-sel.ll
index 07cbb49..b4f4d77 100644
--- a/llvm/test/CodeGen/AArch64/fp-cond-sel.ll
+++ b/llvm/test/CodeGen/AArch64/fp-cond-sel.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-none-linux-gnu -mcpu=cyclone | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -mcpu=cyclone | FileCheck %s --check-prefix=CHECK
 
 @varfloat = global float 0.0
 @vardouble = global double 0.0
@@ -12,7 +12,7 @@
   %tst1 = icmp ugt i32 %lhs32, %rhs32
   %val1 = select i1 %tst1, float 0.0, float 1.0
   store float %val1, float* @varfloat
-; CHECK-ARM64: movi v[[FLT0:[0-9]+]].2d, #0
+; CHECK: movi v[[FLT0:[0-9]+]].2d, #0
 ; CHECK: fmov s[[FLT1:[0-9]+]], #1.0
 ; CHECK: fcsel {{s[0-9]+}}, s[[FLT0]], s[[FLT1]], hi
 
diff --git a/llvm/test/CodeGen/AArch64/fp-dp3.ll b/llvm/test/CodeGen/AArch64/fp-dp3.ll
index 53113b5..10f88fd 100644
--- a/llvm/test/CodeGen/AArch64/fp-dp3.ll
+++ b/llvm/test/CodeGen/AArch64/fp-dp3.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-none-linux-gnu -fp-contract=fast | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -fp-contract=fast | FileCheck %s
 ; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s -check-prefix=CHECK-NOFAST
 
 declare float @llvm.fma.f32(float, float, float)
diff --git a/llvm/test/CodeGen/AArch64/fp128-folding.ll b/llvm/test/CodeGen/AArch64/fp128-folding.ll
index 4b19deb..892b19c 100644
--- a/llvm/test/CodeGen/AArch64/fp128-folding.ll
+++ b/llvm/test/CodeGen/AArch64/fp128-folding.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
 declare void @bar(i8*, i8*, i32*)
 
 ; SelectionDAG used to try to fold some fp128 operations using the ppc128 type,
diff --git a/llvm/test/CodeGen/AArch64/fpimm.ll b/llvm/test/CodeGen/AArch64/fpimm.ll
index e279d5b..e59520c 100644
--- a/llvm/test/CodeGen/AArch64/fpimm.ll
+++ b/llvm/test/CodeGen/AArch64/fpimm.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
 
 @varf32 = global float 0.0
 @varf64 = global double 0.0
diff --git a/llvm/test/CodeGen/AArch64/func-argpassing.ll b/llvm/test/CodeGen/AArch64/func-argpassing.ll
index 129ab25..abb732c 100644
--- a/llvm/test/CodeGen/AArch64/func-argpassing.ll
+++ b/llvm/test/CodeGen/AArch64/func-argpassing.ll
@@ -1,8 +1,5 @@
-
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-ARM64 %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE --check-prefix=CHECK-ARM64-BE %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64_be-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
 
 %myStruct = type { i64 , i8, i32 }
 
@@ -63,7 +60,7 @@
 
     %val0 = load volatile i32* %addr0
     ; Some weird move means x0 is used for one access
-; CHECK-ARM64: ldr [[REG32:w[0-9]+]], [sp, #28]
+; CHECK: ldr [[REG32:w[0-9]+]], [sp, #28]
     store i32 %val0, i32* @var32
 ; CHECK: str [[REG32]], [{{x[0-9]+}}, {{#?}}:lo12:var32]
 
@@ -149,7 +146,6 @@
     %retval = load volatile i32* %stacked
     ret i32 %retval
 ; CHECK-LE: ldr w0, [sp, #16]
-; CHECK-BE-AARCH64: ldr w0, [sp, #20]
 }
 
 define void @stacked_fpu(float %var0, double %var1, float %var2, float %var3,
@@ -159,8 +155,8 @@
     store float %var8, float* @varfloat
     ; Beware as above: the offset would be different on big-endian
     ; machines if the first ldr were changed to use s-registers.
-; CHECK-ARM64: ldr {{[ds]}}[[VALFLOAT:[0-9]+]], [sp]
-; CHECK-ARM64: str s[[VALFLOAT]], [{{x[0-9]+}}, {{#?}}:lo12:varfloat]
+; CHECK: ldr {{[ds]}}[[VALFLOAT:[0-9]+]], [sp]
+; CHECK: str s[[VALFLOAT]], [{{x[0-9]+}}, {{#?}}:lo12:varfloat]
 
     ret void
 }
@@ -185,11 +181,10 @@
     ; Nothing local on the stack in current codegen, so the first stacked
     ; dword is 16 bytes above sp
 ; CHECK-LE: add     x[[REG:[0-9]+]], sp, #16
 ; CHECK-LE: ldr {{x[0-9]+}}, [x[[REG]], #8]
-; CHECK-BE-AARCH64: ldr {{x[0-9]+}}, [sp, #24]
 
     ; The important point is that we address sp+24 for the second dword
 
-; CHECK-ARM64: ldp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK: ldp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
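 ; Worked layout (annotation): with no locals, the incoming stacked words
 ; begin 16 bytes above sp, so dword 0 sits at [sp, #16] and dword 1 at
 ; [sp, #24]; the single ldp above reads both.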
     ret void
 }
 
@@ -209,6 +204,5 @@
                         i32 %val4, i32 %val5, i32 %val6, i32 %val7,
                         i16 %stack1) {
 ; CHECK-LABEL: stacked_i16
-; CHECK-ARM64-BE: ldrh
   ret i16 %stack1
 }
diff --git a/llvm/test/CodeGen/AArch64/func-calls.ll b/llvm/test/CodeGen/AArch64/func-calls.ll
index 8cb5f97..422c576 100644
--- a/llvm/test/CodeGen/AArch64/func-calls.ll
+++ b/llvm/test/CodeGen/AArch64/func-calls.ll
@@ -1,8 +1,7 @@
-
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -mattr=-neon | FileCheck --check-prefix=CHECK --check-prefix=CHECK-ARM64-NONEON %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64_be-none-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefix=CHECK
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-neon | FileCheck --check-prefix=CHECK-NONEON %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=arm64_be-none-linux-gnu | FileCheck --check-prefix=CHECK-BE %s
 
 %myStruct = type { i64 , i8, i32 }
 
@@ -90,13 +89,13 @@
   ; that varstruct is passed on the stack. This is rather dependent on how a
   ; memcpy gets created, but the following works for now.
 
-; CHECK-ARM64-DAG: str {{q[0-9]+}}, [sp]
-; CHECK-ARM64-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0
-; CHECK-ARM64: mov v0.16b, v[[FINAL_DOUBLE]].16b
+; CHECK-DAG: str {{q[0-9]+}}, [sp]
+; CHECK-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0
+; CHECK: mov v0.16b, v[[FINAL_DOUBLE]].16b
 
-; CHECK-ARM64-NONEON-DAG: str {{q[0-9]+}}, [sp]
-; CHECK-ARM64-NONEON-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0
-; CHECK-ARM64-NONEON: fmov d0, d[[FINAL_DOUBLE]]
+; CHECK-NONEON-DAG: str {{q[0-9]+}}, [sp]
+; CHECK-NONEON-DAG: fmov d[[FINAL_DOUBLE:[0-9]+]], #1.0
+; CHECK-NONEON: fmov d0, d[[FINAL_DOUBLE]]
 
 ; CHECK: bl struct_on_stack
 ; CHECK-NOFP-NOT: fmov
@@ -105,11 +104,11 @@
                          float -2.0, float -8.0, float 16.0, float 1.0,
                          float 64.0)
 
-; CHECK-ARM64:  movz [[SIXTY_FOUR:w[0-9]+]], #0x4280, lsl #16
-; CHECK-ARM64: str [[SIXTY_FOUR]], [sp]
+; CHECK:  movz [[SIXTY_FOUR:w[0-9]+]], #0x4280, lsl #16
+; CHECK: str [[SIXTY_FOUR]], [sp]
 
-; CHECK-ARM64-NONEON:  movz [[SIXTY_FOUR:w[0-9]+]], #0x4280, lsl #16
-; CHECK-ARM64-NONEON: str [[SIXTY_FOUR]], [sp]
+; CHECK-NONEON:  movz [[SIXTY_FOUR:w[0-9]+]], #0x4280, lsl #16
+; CHECK-NONEON: str [[SIXTY_FOUR]], [sp]
 
 ; CHECK: bl stacked_fpu
   ret void
@@ -131,8 +130,11 @@
                                    i32 42, i128 %val)
 ; CHECK: ldr [[I128LO:x[0-9]+]], [{{x[0-9]+}}, {{#?}}:lo12:var128]
 ; CHECK: ldr [[I128HI:x[0-9]+]], [{{x[0-9]+}}, #8]
-; CHECK-ARM64: stp [[I128LO]], [[I128HI]], [sp, #16]
-; CHECK-ARM64-NONEON: stp [[I128LO]], [[I128HI]], [sp, #16]
+; CHECK: stp [[I128LO]], [[I128HI]], [sp, #16]
+
+; CHECK-NONEON: ldr [[I128LO:x[0-9]+]], [{{x[0-9]+}}, :lo12:var128]
+; CHECK-NONEON: ldr [[I128HI:x[0-9]+]], [{{x[0-9]+}}, #8]
+; CHECK-NONEON: stp [[I128LO]], [[I128HI]], [sp, #16]
 ; CHECK: bl check_i128_stackalign
 
   call void @check_i128_regalign(i32 0, i128 42)
diff --git a/llvm/test/CodeGen/AArch64/global-alignment.ll b/llvm/test/CodeGen/AArch64/global-alignment.ll
index 2bf4a2c..451b9d6 100644
--- a/llvm/test/CodeGen/AArch64/global-alignment.ll
+++ b/llvm/test/CodeGen/AArch64/global-alignment.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
 
 @var32 = global [3 x i32] zeroinitializer
 @var64 = global [3 x i64] zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/got-abuse.ll b/llvm/test/CodeGen/AArch64/got-abuse.ll
index c23edaf..7a02b10 100644
--- a/llvm/test/CodeGen/AArch64/got-abuse.ll
+++ b/llvm/test/CodeGen/AArch64/got-abuse.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s
-; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=pic -filetype=obj -o - %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -filetype=obj -o - %s
 
 ; LLVM gives well-defined semantics to this horrible construct (though C says
 ; it's undefined). Regardless, we shouldn't crash. The important feature here is
diff --git a/llvm/test/CodeGen/AArch64/illegal-float-ops.ll b/llvm/test/CodeGen/AArch64/illegal-float-ops.ll
index 8320f3a..9f7dd99 100644
--- a/llvm/test/CodeGen/AArch64/illegal-float-ops.ll
+++ b/llvm/test/CodeGen/AArch64/illegal-float-ops.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-none-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
 
 @varfloat = global float 0.0
 @vardouble = global double 0.0
diff --git a/llvm/test/CodeGen/AArch64/init-array.ll b/llvm/test/CodeGen/AArch64/init-array.ll
index d3ed363..f47b490 100644
--- a/llvm/test/CodeGen/AArch64/init-array.ll
+++ b/llvm/test/CodeGen/AArch64/init-array.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=arm64-none-linux-gnu -verify-machineinstrs -use-init-array -o - %s | FileCheck %s
-; RUN: llc -mtriple=arm64-none-none-eabi -verify-machineinstrs -use-init-array -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -use-init-array -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-none-eabi -verify-machineinstrs -use-init-array -o - %s | FileCheck %s
 
 define internal void @_GLOBAL__I_a() section ".text.startup" {
   ret void
diff --git a/llvm/test/CodeGen/AArch64/inline-asm-constraints-badI.ll b/llvm/test/CodeGen/AArch64/inline-asm-constraints-badI.ll
index 7ca9ade..9d833d9 100644
--- a/llvm/test/CodeGen/AArch64/inline-asm-constraints-badI.ll
+++ b/llvm/test/CodeGen/AArch64/inline-asm-constraints-badI.ll
@@ -1,4 +1,4 @@
-; RUN: not llc -mtriple=arm64-none-linux-gnu -o - %s
+; RUN: not llc -mtriple=aarch64-none-linux-gnu -o - %s
 
 define void @foo() {
   ; Out of range immediate for I.
diff --git a/llvm/test/CodeGen/AArch64/inline-asm-constraints-badK2.ll b/llvm/test/CodeGen/AArch64/inline-asm-constraints-badK2.ll
index 6bc6338..1726013 100644
--- a/llvm/test/CodeGen/AArch64/inline-asm-constraints-badK2.ll
+++ b/llvm/test/CodeGen/AArch64/inline-asm-constraints-badK2.ll
@@ -1,4 +1,4 @@
-; RUN: not llc -mtriple=arm64-none-linux-gnu -o - %s
+; RUN: not llc -mtriple=aarch64-none-linux-gnu -o - %s
 
 define void @foo() {
   ; 32-bit bitpattern ending in 1101 can't be produced.
diff --git a/llvm/test/CodeGen/AArch64/jump-table.ll b/llvm/test/CodeGen/AArch64/jump-table.ll
index a0fcafa..1dfb789 100644
--- a/llvm/test/CodeGen/AArch64/jump-table.ll
+++ b/llvm/test/CodeGen/AArch64/jump-table.ll
@@ -1,6 +1,6 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-none-linux-gnu | FileCheck %s
-; RUN: llc -code-model=large -verify-machineinstrs -o - %s -mtriple=arm64-none-linux-gnu | FileCheck --check-prefix=CHECK-LARGE %s
-; RUN: llc -mtriple=arm64-none-linux-gnu -verify-machineinstrs -relocation-model=pic -o - %s | FileCheck --check-prefix=CHECK-PIC %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -code-model=large -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK-LARGE %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic -o - %s | FileCheck --check-prefix=CHECK-PIC %s
 
 define i32 @test_jumptable(i32 %in) {
 ; CHECK: test_jumptable
diff --git a/llvm/test/CodeGen/AArch64/large-consts.ll b/llvm/test/CodeGen/AArch64/large-consts.ll
index b5f6c32..6bf85e8 100644
--- a/llvm/test/CodeGen/AArch64/large-consts.ll
+++ b/llvm/test/CodeGen/AArch64/large-consts.ll
@@ -1,14 +1,14 @@
-; RUN: llc -mtriple=arm64-linux-gnu -o - %s -code-model=large -show-mc-encoding | FileCheck %s --check-prefix=CHECK-ARM64
+; RUN: llc -mtriple=aarch64-linux-gnu -o - %s -code-model=large -show-mc-encoding | FileCheck %s
 
 ; Make sure the shift amount is encoded into the instructions by LLVM because
 ; it's not the linker's job to put it there.
 
 define double @foo() {
 
-; CHECK-ARM64: movz [[CPADDR:x[0-9]+]], #:abs_g3:.LCPI0_0   // encoding: [0bAAA01000,A,0b111AAAAA,0xd2]
-; CHECK-ARM64: movk [[CPADDR]], #:abs_g2_nc:.LCPI0_0 // encoding: [0bAAA01000,A,0b110AAAAA,0xf2]
-; CHECK-ARM64: movk [[CPADDR]], #:abs_g1_nc:.LCPI0_0 // encoding: [0bAAA01000,A,0b101AAAAA,0xf2]
-; CHECK-ARM64: movk [[CPADDR]], #:abs_g0_nc:.LCPI0_0 // encoding: [0bAAA01000,A,0b100AAAAA,0xf2]
+; CHECK: movz [[CPADDR:x[0-9]+]], #:abs_g3:.LCPI0_0   // encoding: [0bAAA01000,A,0b111AAAAA,0xd2]
+; CHECK: movk [[CPADDR]], #:abs_g2_nc:.LCPI0_0 // encoding: [0bAAA01000,A,0b110AAAAA,0xf2]
+; CHECK: movk [[CPADDR]], #:abs_g1_nc:.LCPI0_0 // encoding: [0bAAA01000,A,0b101AAAAA,0xf2]
+; CHECK: movk [[CPADDR]], #:abs_g0_nc:.LCPI0_0 // encoding: [0bAAA01000,A,0b100AAAAA,0xf2]
 
   ret double 3.14159
 }
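 
 ; Worked reading of the encodings above (annotation): in each byte written
 ; as 0b111AAAAA / 0b110AAAAA / 0b101AAAAA / 0b100AAAAA, the leading bit
 ; belongs to the opcode and the next two bits are the "hw" field, so 11,
 ; 10, 01 and 00 select lsl #48, #32, #16 and #0 respectively; only the
 ; 16-bit immediate (the A bits) is left for the :abs_gN: relocation to
 ; fill in.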
diff --git a/llvm/test/CodeGen/AArch64/ldst-regoffset.ll b/llvm/test/CodeGen/AArch64/ldst-regoffset.ll
index b13634c..e2fa08b 100644
--- a/llvm/test/CodeGen/AArch64/ldst-regoffset.ll
+++ b/llvm/test/CodeGen/AArch64/ldst-regoffset.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-none-linux-gnu | FileCheck %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
 
 @var_8bit = global i8 0
 @var_16bit = global i16 0
diff --git a/llvm/test/CodeGen/AArch64/ldst-unscaledimm.ll b/llvm/test/CodeGen/AArch64/ldst-unscaledimm.ll
index d738cfd..1de8443 100644
--- a/llvm/test/CodeGen/AArch64/ldst-unscaledimm.ll
+++ b/llvm/test/CodeGen/AArch64/ldst-unscaledimm.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-linux-gnu | FileCheck %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
 
 @var_8bit = global i8 0
 @var_16bit = global i16 0
diff --git a/llvm/test/CodeGen/AArch64/ldst-unsignedimm.ll b/llvm/test/CodeGen/AArch64/ldst-unsignedimm.ll
index d6475f9..e171d22 100644
--- a/llvm/test/CodeGen/AArch64/ldst-unsignedimm.ll
+++ b/llvm/test/CodeGen/AArch64/ldst-unsignedimm.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-none-linux-gnu | FileCheck %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
 
 @var_8bit = global i8 0
 @var_16bit = global i16 0
diff --git a/llvm/test/CodeGen/ARM64/lit.local.cfg b/llvm/test/CodeGen/AArch64/lit.local.cfg
similarity index 89%
rename from llvm/test/CodeGen/ARM64/lit.local.cfg
rename to llvm/test/CodeGen/AArch64/lit.local.cfg
index 3468e27..77493d8 100644
--- a/llvm/test/CodeGen/ARM64/lit.local.cfg
+++ b/llvm/test/CodeGen/AArch64/lit.local.cfg
@@ -3,7 +3,7 @@
 config.suffixes = ['.ll']
 
 targets = set(config.root.targets_to_build.split())
-if not 'ARM64' in targets:
+if 'AArch64' not in targets:
     config.unsupported = True
 
 # For now we don't test arm64-win32.
diff --git a/llvm/test/CodeGen/AArch64/literal_pools_float.ll b/llvm/test/CodeGen/AArch64/literal_pools_float.ll
index 6f9f3fc..e53b8b6 100644
--- a/llvm/test/CodeGen/AArch64/literal_pools_float.ll
+++ b/llvm/test/CodeGen/AArch64/literal_pools_float.ll
@@ -1,7 +1,7 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-none-linux-gnu -mcpu=cyclone | FileCheck %s
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-none-linux-gnu -code-model=large -mcpu=cyclone | FileCheck --check-prefix=CHECK-LARGE %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -code-model=large -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP-LARGE %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -mcpu=cyclone | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-none-linux-gnu -code-model=large -mcpu=cyclone | FileCheck --check-prefix=CHECK-LARGE %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -code-model=large -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP-LARGE %s
 
 @varfloat = global float 0.0
 @vardouble = global double 0.0
diff --git a/llvm/test/CodeGen/AArch64/local_vars.ll b/llvm/test/CodeGen/AArch64/local_vars.ll
index 4518fa2..2f5b9f2 100644
--- a/llvm/test/CodeGen/AArch64/local_vars.ll
+++ b/llvm/test/CodeGen/AArch64/local_vars.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu | FileCheck %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -disable-fp-elim | FileCheck -check-prefix CHECK-WITHFP-ARM64 %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -disable-fp-elim | FileCheck -check-prefix CHECK-WITHFP-ARM64 %s
 
 ; Make sure a reasonably sane prologue and epilogue are
 ; generated. This test is not robust in the face of a frame-handling
diff --git a/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll b/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll
index 608d44f..b249d72 100644
--- a/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll
+++ b/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
 
 @var1_32 = global i32 0
 @var2_32 = global i32 0
diff --git a/llvm/test/CodeGen/AArch64/mature-mc-support.ll b/llvm/test/CodeGen/AArch64/mature-mc-support.ll
index 2948da9..276c54d 100644
--- a/llvm/test/CodeGen/AArch64/mature-mc-support.ll
+++ b/llvm/test/CodeGen/AArch64/mature-mc-support.ll
@@ -1,14 +1,10 @@
 ; Test that inline assembly is parsed by the MC layer when MC support is mature
 ; (even when the output is assembly).
 
-; RUN: FileCheck %s < %t1
-
-; RUN: FileCheck %s < %t2
-
-; RUN: not llc -mtriple=arm64-pc-linux < %s > /dev/null 2> %t3
+; RUN: not llc -mtriple=aarch64-pc-linux < %s > /dev/null 2> %t3
 ; RUN: FileCheck %s < %t3
 
-; RUN: not llc -mtriple=arm64-pc-linux -filetype=obj < %s > /dev/null 2> %t4
+; RUN: not llc -mtriple=aarch64-pc-linux -filetype=obj < %s > /dev/null 2> %t4
 ; RUN: FileCheck %s < %t4
 
 module asm "	.this_directive_is_very_unlikely_to_exist"
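 
 ; Sketch of the mechanism (annotation): the integrated assembler parses the
 ; module asm above even when emitting assembly, so the unknown directive is
 ; diagnosed, llc exits non-zero (hence the "not" on both RUN lines), and
 ; FileCheck matches the resulting error on stderr.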
diff --git a/llvm/test/CodeGen/AArch64/movw-consts.ll b/llvm/test/CodeGen/AArch64/movw-consts.ll
index 6fe0009..93c1812 100644
--- a/llvm/test/CodeGen/AArch64/movw-consts.ll
+++ b/llvm/test/CodeGen/AArch64/movw-consts.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-apple-ios7.0 | FileCheck %s --check-prefix=CHECK
 
 define i64 @test0() {
 ; CHECK-LABEL: test0:
@@ -9,43 +9,43 @@
 
 define i64 @test1() {
 ; CHECK-LABEL: test1:
-; CHECK-ARM64: orr w0, wzr, #0x1
+; CHECK: orr w0, wzr, #0x1
   ret i64 1
 }
 
 define i64 @test2() {
 ; CHECK-LABEL: test2:
-; CHECK-ARM64: orr w0, wzr, #0xffff
+; CHECK: orr w0, wzr, #0xffff
   ret i64 65535
 }
 
 define i64 @test3() {
 ; CHECK-LABEL: test3:
-; CHECK-ARM64: orr w0, wzr, #0x10000
+; CHECK: orr w0, wzr, #0x10000
   ret i64 65536
 }
 
 define i64 @test4() {
 ; CHECK-LABEL: test4:
-; CHECK-ARM64: orr w0, wzr, #0xffff0000
+; CHECK: orr w0, wzr, #0xffff0000
   ret i64 4294901760
 }
 
 define i64 @test5() {
 ; CHECK-LABEL: test5:
-; CHECK-ARM64: orr x0, xzr, #0x100000000
+; CHECK: orr x0, xzr, #0x100000000
   ret i64 4294967296
 }
 
 define i64 @test6() {
 ; CHECK-LABEL: test6:
-; CHECK-ARM64: orr x0, xzr, #0xffff00000000
+; CHECK: orr x0, xzr, #0xffff00000000
   ret i64 281470681743360
 }
 
 define i64 @test7() {
 ; CHECK-LABEL: test7:
-; CHECK-ARM64: orr x0, xzr, #0x1000000000000
+; CHECK: orr x0, xzr, #0x1000000000000
   ret i64 281474976710656
 }
 
@@ -75,35 +75,35 @@
 
 define void @test11() {
 ; CHECK-LABEL: test11:
-; CHECK-ARM64: str wzr
+; CHECK: str wzr
   store i32 0, i32* @var32
   ret void
 }
 
 define void @test12() {
 ; CHECK-LABEL: test12:
-; CHECK-ARM64: orr {{w[0-9]+}}, wzr, #0x1
+; CHECK: orr {{w[0-9]+}}, wzr, #0x1
   store i32 1, i32* @var32
   ret void
 }
 
 define void @test13() {
 ; CHECK-LABEL: test13:
-; CHECK-ARM64: orr {{w[0-9]+}}, wzr, #0xffff
+; CHECK: orr {{w[0-9]+}}, wzr, #0xffff
   store i32 65535, i32* @var32
   ret void
 }
 
 define void @test14() {
 ; CHECK-LABEL: test14:
-; CHECK-ARM64: orr {{w[0-9]+}}, wzr, #0x10000
+; CHECK: orr {{w[0-9]+}}, wzr, #0x10000
   store i32 65536, i32* @var32
   ret void
 }
 
 define void @test15() {
 ; CHECK-LABEL: test15:
-; CHECK-ARM64: orr {{w[0-9]+}}, wzr, #0xffff0000
+; CHECK: orr {{w[0-9]+}}, wzr, #0xffff0000
   store i32 4294901760, i32* @var32
   ret void
 }
@@ -119,6 +119,6 @@
 ; CHECK-LABEL: test17:
 
   ; Mustn't use MOVN on w0 here: a 32-bit write zeroes the upper bits,
   ; but -3 needs all 64 bits set.
-; CHECK-ARM64: orr x0, xzr, #0xfffffffffffffffd
+; CHECK: orr x0, xzr, #0xfffffffffffffffd
   ret i64 -3
 }
diff --git a/llvm/test/CodeGen/AArch64/movw-shift-encoding.ll b/llvm/test/CodeGen/AArch64/movw-shift-encoding.ll
index 2fe9dd4..178fccc 100644
--- a/llvm/test/CodeGen/AArch64/movw-shift-encoding.ll
+++ b/llvm/test/CodeGen/AArch64/movw-shift-encoding.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-linux-gnu < %s -show-mc-encoding -code-model=large | FileCheck %s --check-prefix=CHECK-ARM64
+; RUN: llc -mtriple=aarch64-linux-gnu < %s -show-mc-encoding -code-model=large | FileCheck %s
 
 @var = global i32 0
 
@@ -8,8 +8,8 @@
 define i32* @get_var() {
   ret i32* @var
 
-; CHECK-ARM64: movz    x0, #:abs_g3:var        // encoding: [0bAAA00000,A,0b111AAAAA,0xd2]
-; CHECK-ARM64: movk    x0, #:abs_g2_nc:var     // encoding: [0bAAA00000,A,0b110AAAAA,0xf2]
-; CHECK-ARM64: movk    x0, #:abs_g1_nc:var     // encoding: [0bAAA00000,A,0b101AAAAA,0xf2]
-; CHECK-ARM64: movk    x0, #:abs_g0_nc:var     // encoding: [0bAAA00000,A,0b100AAAAA,0xf2]
+; CHECK: movz    x0, #:abs_g3:var        // encoding: [0bAAA00000,A,0b111AAAAA,0xd2]
+; CHECK: movk    x0, #:abs_g2_nc:var     // encoding: [0bAAA00000,A,0b110AAAAA,0xf2]
+; CHECK: movk    x0, #:abs_g1_nc:var     // encoding: [0bAAA00000,A,0b101AAAAA,0xf2]
+; CHECK: movk    x0, #:abs_g0_nc:var     // encoding: [0bAAA00000,A,0b100AAAAA,0xf2]
 }
diff --git a/llvm/test/CodeGen/AArch64/neon-bitcast.ll b/llvm/test/CodeGen/AArch64/neon-bitcast.ll
index b70cda3..61099d4 100644
--- a/llvm/test/CodeGen/AArch64/neon-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/neon-bitcast.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon -verify-machineinstrs < %s | FileCheck %s
 
 ; From <8 x i8>
 
diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
index dfaf1f2..6497856 100644
--- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 define <8 x i8> @and8xi8(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-LABEL: and8xi8:
diff --git a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
index b99057e..6d89dfb 100644
--- a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
 define <8 x i8> @cmeq8xi8(<8 x i8> %A, <8 x i8> %B) {
 ; CHECK-LABEL: cmeq8xi8:
diff --git a/llvm/test/CodeGen/AArch64/neon-diagnostics.ll b/llvm/test/CodeGen/AArch64/neon-diagnostics.ll
index e28df29..099b685 100644
--- a/llvm/test/CodeGen/AArch64/neon-diagnostics.ll
+++ b/llvm/test/CodeGen/AArch64/neon-diagnostics.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
 ; CHECK: test_vfma_lane_f32:
diff --git a/llvm/test/CodeGen/AArch64/neon-extract.ll b/llvm/test/CodeGen/AArch64/neon-extract.ll
index 96b4084..f270b54 100644
--- a/llvm/test/CodeGen/AArch64/neon-extract.ll
+++ b/llvm/test/CodeGen/AArch64/neon-extract.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 define <8 x i8> @test_vext_s8(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-LABEL: test_vext_s8:
diff --git a/llvm/test/CodeGen/AArch64/neon-fma.ll b/llvm/test/CodeGen/AArch64/neon-fma.ll
index 6df494d..af70302 100644
--- a/llvm/test/CodeGen/AArch64/neon-fma.ll
+++ b/llvm/test/CodeGen/AArch64/neon-fma.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
 
 define <2 x float> @fmla2xfloat(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
 ;CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
diff --git a/llvm/test/CodeGen/AArch64/neon-fpround_f128.ll b/llvm/test/CodeGen/AArch64/neon-fpround_f128.ll
index e48dbba..a93f3f2 100644
--- a/llvm/test/CodeGen/AArch64/neon-fpround_f128.ll
+++ b/llvm/test/CodeGen/AArch64/neon-fpround_f128.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
 
 define <1 x double> @test_fpround_v1f128(<1 x fp128>* %a) {
 ; CHECK-LABEL: test_fpround_v1f128:
diff --git a/llvm/test/CodeGen/AArch64/neon-idiv.ll b/llvm/test/CodeGen/AArch64/neon-idiv.ll
index 11e1af7..de402c4 100644
--- a/llvm/test/CodeGen/AArch64/neon-idiv.ll
+++ b/llvm/test/CodeGen/AArch64/neon-idiv.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-none-linux-gnu < %s -mattr=+neon | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -mattr=+neon | FileCheck %s
 
 define <4 x i32> @test1(<4 x i32> %a) {
   %rem = srem <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
diff --git a/llvm/test/CodeGen/AArch64/neon-mla-mls.ll b/llvm/test/CodeGen/AArch64/neon-mla-mls.ll
index e7bff74..71bb0e7 100644
--- a/llvm/test/CodeGen/AArch64/neon-mla-mls.ll
+++ b/llvm/test/CodeGen/AArch64/neon-mla-mls.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 
 define <8 x i8> @mla8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) {
diff --git a/llvm/test/CodeGen/AArch64/neon-mov.ll b/llvm/test/CodeGen/AArch64/neon-mov.ll
index b7baf25..40649ae 100644
--- a/llvm/test/CodeGen/AArch64/neon-mov.ll
+++ b/llvm/test/CodeGen/AArch64/neon-mov.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK
 
 define <8 x i8> @movi8b() {
 ; CHECK-LABEL: movi8b:
@@ -14,75 +14,75 @@
 
 define <2 x i32> @movi2s_lsl0() {
 ; CHECK-LABEL: movi2s_lsl0:
-; CHECK-ARM64: movi {{d[0-9]+}}, #0x0000ff000000ff
+; CHECK: movi {{d[0-9]+}}, #0x0000ff000000ff
    ret <2 x i32> < i32 255, i32 255 >
 }
 
 define <2 x i32> @movi2s_lsl8() {
 ; CHECK-LABEL: movi2s_lsl8:
-; CHECK-ARM64: movi {{d[0-9]+}}, #0x00ff000000ff00
+; CHECK: movi {{d[0-9]+}}, #0x00ff000000ff00
    ret <2 x i32> < i32 65280, i32 65280 >
 }
 
 define <2 x i32> @movi2s_lsl16() {
 ; CHECK-LABEL: movi2s_lsl16:
-; CHECK-ARM64: movi {{d[0-9]+}}, #0xff000000ff0000
+; CHECK: movi {{d[0-9]+}}, #0xff000000ff0000
    ret <2 x i32> < i32 16711680, i32 16711680 >
 
 }
 
 define <2 x i32> @movi2s_lsl24() {
 ; CHECK-LABEL: movi2s_lsl24:
-; CHECK-ARM64: movi {{d[0-9]+}}, #0xff000000ff000000
+; CHECK: movi {{d[0-9]+}}, #0xff000000ff000000
    ret <2 x i32> < i32 4278190080, i32 4278190080 >
 }
 
 define <4 x i32> @movi4s_lsl0() {
 ; CHECK-LABEL: movi4s_lsl0:
-; CHECK-ARM64: movi {{v[0-9]+}}.2d, #0x0000ff000000ff
+; CHECK: movi {{v[0-9]+}}.2d, #0x0000ff000000ff
    ret <4 x i32> < i32 255, i32 255, i32 255, i32 255 >
 }
 
 define <4 x i32> @movi4s_lsl8() {
 ; CHECK-LABEL: movi4s_lsl8:
-; CHECK-ARM64: movi {{v[0-9]+}}.2d, #0x00ff000000ff00
+; CHECK: movi {{v[0-9]+}}.2d, #0x00ff000000ff00
    ret <4 x i32> < i32 65280, i32 65280, i32 65280, i32 65280 >
 }
 
 define <4 x i32> @movi4s_lsl16() {
 ; CHECK-LABEL: movi4s_lsl16:
-; CHECK-ARM64:  movi {{v[0-9]+}}.2d, #0xff000000ff0000
+; CHECK:  movi {{v[0-9]+}}.2d, #0xff000000ff0000
    ret <4 x i32> < i32 16711680, i32 16711680, i32 16711680, i32 16711680 >
 
 }
 
 define <4 x i32> @movi4s_lsl24() {
 ; CHECK-LABEL: movi4s_lsl24:
-; CHECK-ARM64:  movi {{v[0-9]+}}.2d, #0xff000000ff000000
+; CHECK:  movi {{v[0-9]+}}.2d, #0xff000000ff000000
    ret <4 x i32> < i32 4278190080, i32 4278190080, i32 4278190080, i32 4278190080 >
 }
 
 define <4 x i16> @movi4h_lsl0() {
 ; CHECK-LABEL: movi4h_lsl0:
-; CHECK-ARM64:  movi {{d[0-9]+}}, #0xff00ff00ff00ff
+; CHECK:  movi {{d[0-9]+}}, #0xff00ff00ff00ff
    ret <4 x i16> < i16 255, i16 255, i16 255, i16 255 >
 }
 
 define <4 x i16> @movi4h_lsl8() {
 ; CHECK-LABEL: movi4h_lsl8:
-; CHECK-ARM64: movi d0, #0xff00ff00ff00ff00
+; CHECK: movi d0, #0xff00ff00ff00ff00
    ret <4 x i16> < i16 65280, i16 65280, i16 65280, i16 65280 >
 }
 
 define <8 x i16> @movi8h_lsl0() {
 ; CHECK-LABEL: movi8h_lsl0:
-; CHECK-ARM64: movi v0.2d, #0xff00ff00ff00ff
+; CHECK: movi v0.2d, #0xff00ff00ff00ff
    ret <8 x i16> < i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255 >
 }
 
 define <8 x i16> @movi8h_lsl8() {
 ; CHECK-LABEL: movi8h_lsl8:
-; CHECK-ARM64: movi v0.2d, #0xff00ff00ff00ff00
+; CHECK: movi v0.2d, #0xff00ff00ff00ff00
    ret <8 x i16> < i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280 >
 }
 
@@ -164,26 +164,26 @@
 
 define <2 x i32> @movi2s_msl8(<2 x i32> %a) {
 ; CHECK-LABEL: movi2s_msl8:
-; CHECK-ARM64: movi {{d[0-9]+}}, #0x00ffff0000ffff
+; CHECK: movi {{d[0-9]+}}, #0x00ffff0000ffff
 	ret <2 x i32> < i32 65535, i32 65535 >
 }
 
 define <2 x i32> @movi2s_msl16() {
 ; CHECK-LABEL: movi2s_msl16:
-; CHECK-ARM64:  movi d0, #0xffffff00ffffff
+; CHECK:  movi d0, #0xffffff00ffffff
    ret <2 x i32> < i32 16777215, i32 16777215 >
 }
 
 
 define <4 x i32> @movi4s_msl8() {
 ; CHECK-LABEL: movi4s_msl8:
-; CHECK-ARM64:  movi v0.2d, #0x00ffff0000ffff
+; CHECK:  movi v0.2d, #0x00ffff0000ffff
    ret <4 x i32> < i32 65535, i32 65535, i32 65535, i32 65535 >
 }
 
 define <4 x i32> @movi4s_msl16() {
 ; CHECK-LABEL: movi4s_msl16:
-; CHECK-ARM64:  movi v0.2d, #0xffffff00ffffff
+; CHECK:  movi v0.2d, #0xffffff00ffffff
    ret <4 x i32> < i32 16777215, i32 16777215, i32 16777215, i32 16777215 >
 }
 
diff --git a/llvm/test/CodeGen/AArch64/neon-or-combine.ll b/llvm/test/CodeGen/AArch64/neon-or-combine.ll
index d98c128..260f693 100644
--- a/llvm/test/CodeGen/AArch64/neon-or-combine.ll
+++ b/llvm/test/CodeGen/AArch64/neon-or-combine.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 ; Check that the DAGCombiner does not crash with an assertion failure
 ; when performing a target-specific combine to simplify an 'or' DAG node
diff --git a/llvm/test/CodeGen/AArch64/neon-perm.ll b/llvm/test/CodeGen/AArch64/neon-perm.ll
index d45dde6..4f8571d 100644
--- a/llvm/test/CodeGen/AArch64/neon-perm.ll
+++ b/llvm/test/CodeGen/AArch64/neon-perm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefix=CHECK
 
 %struct.int8x8x2_t = type { [2 x <8 x i8>] }
 %struct.int16x4x2_t = type { [2 x <4 x i16>] }
@@ -53,7 +53,7 @@
 
 define <2 x i32> @test_vuzp1_s32(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: test_vuzp1_s32:
-; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
   %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
   ret <2 x i32> %shuffle.i
@@ -69,7 +69,7 @@
 
 define <2 x i64> @test_vuzp1q_s64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: test_vuzp1q_s64:
-; CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 entry:
   %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
   ret <2 x i64> %shuffle.i
@@ -109,7 +109,7 @@
 
 define <2 x i32> @test_vuzp1_u32(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: test_vuzp1_u32:
-; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
   %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
   ret <2 x i32> %shuffle.i
@@ -125,7 +125,7 @@
 
 define <2 x i64> @test_vuzp1q_u64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: test_vuzp1q_u64:
-; CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 entry:
   %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
   ret <2 x i64> %shuffle.i
@@ -133,7 +133,7 @@
 
 define <2 x float> @test_vuzp1_f32(<2 x float> %a, <2 x float> %b) {
 ; CHECK-LABEL: test_vuzp1_f32:
-; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
   %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
   ret <2 x float> %shuffle.i
@@ -149,7 +149,7 @@
 
 define <2 x double> @test_vuzp1q_f64(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: test_vuzp1q_f64:
-; CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 entry:
   %shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
   ret <2 x double> %shuffle.i
@@ -221,7 +221,7 @@
 
 define <2 x i32> @test_vuzp2_s32(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: test_vuzp2_s32:
-; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
   %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
   ret <2 x i32> %shuffle.i
@@ -237,7 +237,7 @@
 
 define <2 x i64> @test_vuzp2q_s64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: test_vuzp2q_s64:
-; CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 entry:
   %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
   ret <2 x i64> %shuffle.i
@@ -277,7 +277,7 @@
 
 define <2 x i32> @test_vuzp2_u32(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: test_vuzp2_u32:
-; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
   %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
   ret <2 x i32> %shuffle.i
@@ -293,7 +293,7 @@
 
 define <2 x i64> @test_vuzp2q_u64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: test_vuzp2q_u64:
-; CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 entry:
   %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
   ret <2 x i64> %shuffle.i
@@ -301,7 +301,7 @@
 
 define <2 x float> @test_vuzp2_f32(<2 x float> %a, <2 x float> %b) {
 ; CHECK-LABEL: test_vuzp2_f32:
-; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
   %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
   ret <2 x float> %shuffle.i
@@ -317,7 +317,7 @@
 
 define <2 x double> @test_vuzp2q_f64(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: test_vuzp2q_f64:
-; CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 entry:
   %shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
   ret <2 x double> %shuffle.i
@@ -389,7 +389,7 @@
 
 define <2 x i32> @test_vzip1_s32(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: test_vzip1_s32:
-; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
   %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
   ret <2 x i32> %shuffle.i
@@ -405,7 +405,7 @@
 
 define <2 x i64> @test_vzip1q_s64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: test_vzip1q_s64:
-; CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 entry:
   %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
   ret <2 x i64> %shuffle.i
@@ -445,7 +445,7 @@
 
 define <2 x i32> @test_vzip1_u32(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: test_vzip1_u32:
-; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
   %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
   ret <2 x i32> %shuffle.i
@@ -461,7 +461,7 @@
 
 define <2 x i64> @test_vzip1q_u64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: test_vzip1q_u64:
-; CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 entry:
   %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
   ret <2 x i64> %shuffle.i
@@ -469,7 +469,7 @@
 
 define <2 x float> @test_vzip1_f32(<2 x float> %a, <2 x float> %b) {
 ; CHECK-LABEL: test_vzip1_f32:
-; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
   %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
   ret <2 x float> %shuffle.i
@@ -485,7 +485,7 @@
 
 define <2 x double> @test_vzip1q_f64(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: test_vzip1q_f64:
-; CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 entry:
   %shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
   ret <2 x double> %shuffle.i
@@ -557,7 +557,7 @@
 
 define <2 x i32> @test_vzip2_s32(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: test_vzip2_s32:
-; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
   %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
   ret <2 x i32> %shuffle.i
@@ -573,7 +573,7 @@
 
 define <2 x i64> @test_vzip2q_s64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: test_vzip2q_s64:
-; CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 entry:
   %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
   ret <2 x i64> %shuffle.i
@@ -613,7 +613,7 @@
 
 define <2 x i32> @test_vzip2_u32(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: test_vzip2_u32:
-; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
   %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
   ret <2 x i32> %shuffle.i
@@ -629,7 +629,7 @@
 
 define <2 x i64> @test_vzip2q_u64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: test_vzip2q_u64:
-; CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 entry:
   %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
   ret <2 x i64> %shuffle.i
@@ -637,7 +637,7 @@
 
 define <2 x float> @test_vzip2_f32(<2 x float> %a, <2 x float> %b) {
 ; CHECK-LABEL: test_vzip2_f32:
-; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
   %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
   ret <2 x float> %shuffle.i
@@ -653,7 +653,7 @@
 
 define <2 x double> @test_vzip2q_f64(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: test_vzip2q_f64:
-; CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 entry:
   %shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
   ret <2 x double> %shuffle.i
@@ -725,7 +725,7 @@
 
 define <2 x i32> @test_vtrn1_s32(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: test_vtrn1_s32:
-; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
   %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
   ret <2 x i32> %shuffle.i
@@ -741,7 +741,7 @@
 
 define <2 x i64> @test_vtrn1q_s64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: test_vtrn1q_s64:
-; CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 entry:
   %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
   ret <2 x i64> %shuffle.i
@@ -781,7 +781,7 @@
 
 define <2 x i32> @test_vtrn1_u32(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: test_vtrn1_u32:
-; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
   %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
   ret <2 x i32> %shuffle.i
@@ -797,7 +797,7 @@
 
 define <2 x i64> @test_vtrn1q_u64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: test_vtrn1q_u64:
-; CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 entry:
   %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
   ret <2 x i64> %shuffle.i
@@ -805,7 +805,7 @@
 
 define <2 x float> @test_vtrn1_f32(<2 x float> %a, <2 x float> %b) {
 ; CHECK-LABEL: test_vtrn1_f32:
-; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
   %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
   ret <2 x float> %shuffle.i
@@ -821,7 +821,7 @@
 
 define <2 x double> @test_vtrn1q_f64(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: test_vtrn1q_f64:
-; CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 entry:
   %shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
   ret <2 x double> %shuffle.i
@@ -893,7 +893,7 @@
 
 define <2 x i32> @test_vtrn2_s32(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: test_vtrn2_s32:
-; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
   %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
   ret <2 x i32> %shuffle.i
@@ -909,7 +909,7 @@
 
 define <2 x i64> @test_vtrn2q_s64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: test_vtrn2q_s64:
-; CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 entry:
   %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
   ret <2 x i64> %shuffle.i
@@ -949,7 +949,7 @@
 
 define <2 x i32> @test_vtrn2_u32(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: test_vtrn2_u32:
-; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
   %shuffle.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
   ret <2 x i32> %shuffle.i
@@ -965,7 +965,7 @@
 
 define <2 x i64> @test_vtrn2q_u64(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: test_vtrn2q_u64:
-; CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 entry:
   %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
   ret <2 x i64> %shuffle.i
@@ -973,7 +973,7 @@
 
 define <2 x float> @test_vtrn2_f32(<2 x float> %a, <2 x float> %b) {
 ; CHECK-LABEL: test_vtrn2_f32:
-; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
   %shuffle.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
   ret <2 x float> %shuffle.i
@@ -989,7 +989,7 @@
 
 define <2 x double> @test_vtrn2q_f64(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: test_vtrn2q_f64:
-; CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 entry:
   %shuffle.i = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
   ret <2 x double> %shuffle.i
@@ -2494,8 +2494,8 @@
 
 define %struct.int32x2x2_t @test_vuzp_s32(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: test_vuzp_s32:
-; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
   %vuzp.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
   %vuzp1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
@@ -2530,8 +2530,8 @@
 
 define %struct.uint32x2x2_t @test_vuzp_u32(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: test_vuzp_u32:
-; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
   %vuzp.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
   %vuzp1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
@@ -2542,8 +2542,8 @@
 
 define %struct.float32x2x2_t @test_vuzp_f32(<2 x float> %a, <2 x float> %b) {
 ; CHECK-LABEL: test_vuzp_f32:
-; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
   %vuzp.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
   %vuzp1.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
@@ -2710,8 +2710,8 @@
 
 define %struct.int32x2x2_t @test_vzip_s32(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: test_vzip_s32:
-; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
   %vzip.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
   %vzip1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
@@ -2746,8 +2746,8 @@
 
 define %struct.uint32x2x2_t @test_vzip_u32(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: test_vzip_u32:
-; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
   %vzip.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
   %vzip1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
@@ -2758,8 +2758,8 @@
 
 define %struct.float32x2x2_t @test_vzip_f32(<2 x float> %a, <2 x float> %b) {
 ; CHECK-LABEL: test_vzip_f32:
-; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
   %vzip.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
   %vzip1.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
@@ -2926,8 +2926,8 @@
 
 define %struct.int32x2x2_t @test_vtrn_s32(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: test_vtrn_s32:
-; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
   %vtrn.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
   %vtrn1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
@@ -2962,8 +2962,8 @@
 
 define %struct.uint32x2x2_t @test_vtrn_u32(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: test_vtrn_u32:
-; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
   %vtrn.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
   %vtrn1.i = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
@@ -2974,8 +2974,8 @@
 
 define %struct.float32x2x2_t @test_vtrn_f32(<2 x float> %a, <2 x float> %b) {
 ; CHECK-LABEL: test_vtrn_f32:
-; CHECK-ARM64: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-; CHECK-ARM64: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip1 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: zip2 {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 entry:
   %vtrn.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
   %vtrn1.i = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
diff --git a/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll b/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
index 6cfdc5b..32f5962 100644
--- a/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
+++ b/llvm/test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
 
 declare float @llvm.fma.f32(float, float, float)
 declare double @llvm.fma.f64(double, double, double)
diff --git a/llvm/test/CodeGen/AArch64/neon-scalar-copy.ll b/llvm/test/CodeGen/AArch64/neon-scalar-copy.ll
index ab7ea66..a01df32 100644
--- a/llvm/test/CodeGen/AArch64/neon-scalar-copy.ll
+++ b/llvm/test/CodeGen/AArch64/neon-scalar-copy.ll
@@ -1,9 +1,9 @@
-; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon < %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s --check-prefix=CHECK
 
 
 define float @test_dup_sv2S(<2 x float> %v) {
  ; CHECK-LABEL: test_dup_sv2S
- ; CHECK-ARM64: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
+ ; CHECK: ins {{v[0-9]+}}.s[0], {{v[0-9]+}}.s[1]
  %tmp1 = extractelement <2 x float> %v, i32 1
  ret float  %tmp1
 }
@@ -37,14 +37,14 @@
 
 define double @test_dup_dv2D(<2 x double> %v) {
  ; CHECK-LABEL: test_dup_dv2D
- ; CHECK-ARM64: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+ ; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
  %tmp1 = extractelement <2 x double> %v, i32 1
  ret double  %tmp1
 }
 
 define double @test_dup_dv2D_0(<2 x double> %v) {
  ; CHECK-LABEL: test_dup_dv2D_0
- ; CHECK-ARM64: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
+ ; CHECK: ins {{v[0-9]+}}.d[0], {{v[0-9]+}}.d[1]
  ; CHECK: ret
  %tmp1 = extractelement <2 x double> %v, i32 1
  ret double  %tmp1
@@ -88,7 +88,7 @@
 
 define <1 x i64> @test_vector_dup_dv2D(<2 x i64> %v1) {
  ; CHECK-LABEL: test_vector_dup_dv2D
- ; CHECK-ARM64: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #8
+ ; CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #8
  %shuffle.i = shufflevector <2 x i64> %v1, <2 x i64> undef, <1 x i32> <i32 1> 
  ret <1 x i64> %shuffle.i
 }
diff --git a/llvm/test/CodeGen/AArch64/neon-shift-left-long.ll b/llvm/test/CodeGen/AArch64/neon-shift-left-long.ll
index 1d9c92c..d10d551 100644
--- a/llvm/test/CodeGen/AArch64/neon-shift-left-long.ll
+++ b/llvm/test/CodeGen/AArch64/neon-shift-left-long.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 define <8 x i16> @test_sshll_v8i8(<8 x i8> %a) {
 ; CHECK: test_sshll_v8i8:
diff --git a/llvm/test/CodeGen/AArch64/neon-truncStore-extLoad.ll b/llvm/test/CodeGen/AArch64/neon-truncStore-extLoad.ll
index f15cd24..1df3719 100644
--- a/llvm/test/CodeGen/AArch64/neon-truncStore-extLoad.ll
+++ b/llvm/test/CodeGen/AArch64/neon-truncStore-extLoad.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 ; A vector TruncStore cannot be selected.
 ; Test that a trunc IR and a vector store IR can be selected correctly.
diff --git a/llvm/test/CodeGen/AArch64/pic-eh-stubs.ll b/llvm/test/CodeGen/AArch64/pic-eh-stubs.ll
index d269791..e8c7625 100644
--- a/llvm/test/CodeGen/AArch64/pic-eh-stubs.ll
+++ b/llvm/test/CodeGen/AArch64/pic-eh-stubs.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s
 ; RUN: llc -mtriple=arm64_be-none-linux-gnu -relocation-model=pic -o - %s | FileCheck %s
 
 ; Make sure exception-handling PIC code can be linked correctly. An alternative
diff --git a/llvm/test/CodeGen/AArch64/regress-f128csel-flags.ll b/llvm/test/CodeGen/AArch64/regress-f128csel-flags.ll
index 313cdb1..25b5e0c 100644
--- a/llvm/test/CodeGen/AArch64/regress-f128csel-flags.ll
+++ b/llvm/test/CodeGen/AArch64/regress-f128csel-flags.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-none-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
 
 ; We used to not mark NZCV as being used in the continuation basic-block
 ; when lowering a 128-bit "select" to branches. This meant a subsequent use
diff --git a/llvm/test/CodeGen/AArch64/regress-fp128-livein.ll b/llvm/test/CodeGen/AArch64/regress-fp128-livein.ll
index 141c0d8..5e6ab0a 100644
--- a/llvm/test/CodeGen/AArch64/regress-fp128-livein.ll
+++ b/llvm/test/CodeGen/AArch64/regress-fp128-livein.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-linux-gnu -verify-machineinstrs -o - %s
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s
 
 ; Regression test for NZCV reg live-in not being added to fp128csel IfTrue BB,
 ; causing a crash during live range calc.
diff --git a/llvm/test/CodeGen/AArch64/regress-tblgen-chains.ll b/llvm/test/CodeGen/AArch64/regress-tblgen-chains.ll
index 55c3bcd..477d996 100644
--- a/llvm/test/CodeGen/AArch64/regress-tblgen-chains.ll
+++ b/llvm/test/CodeGen/AArch64/regress-tblgen-chains.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -mtriple=arm64-apple-ios7.0 -o - %s | FileCheck %s --check-prefix CHECK-ARM64
+; RUN: llc -verify-machineinstrs -mtriple=arm64-apple-ios7.0 -o - %s | FileCheck %s
 
 ; When generating DAG selection tables, TableGen used to only flag an
 ; instruction as needing a chain on its own account if it had a built-in pattern
@@ -12,7 +12,7 @@
 declare void @bar(i8*)
 
 define i64 @test_chains() {
-; CHECK-ARM64-LABEL: test_chains:
+; CHECK-LABEL: test_chains:
 
   %locvar = alloca i8
 
@@ -25,13 +25,13 @@
   %inc.4 = trunc i64 %inc.3 to i8
   store i8 %inc.4, i8* %locvar
 
-; CHECK-ARM64: ldurb {{w[0-9]+}}, [x29, [[LOCADDR:#-?[0-9]+]]]
-; CHECK-ARM64: add {{w[0-9]+}}, {{w[0-9]+}}, #1
-; CHECK-ARM64: sturb {{w[0-9]+}}, [x29, [[LOCADDR]]]
-; CHECK-ARM64: ldurb {{w[0-9]+}}, [x29, [[LOCADDR]]]
+; CHECK: ldurb {{w[0-9]+}}, [x29, [[LOCADDR:#-?[0-9]+]]]
+; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #1
+; CHECK: sturb {{w[0-9]+}}, [x29, [[LOCADDR]]]
+; CHECK: ldurb {{w[0-9]+}}, [x29, [[LOCADDR]]]
 
   %ret.1 = load i8* %locvar
   %ret.2 = zext i8 %ret.1 to i64
   ret i64 %ret.2
-; CHECK-ARM64: ret
+; CHECK: ret
 }
diff --git a/llvm/test/CodeGen/AArch64/regress-w29-reserved-with-fp.ll b/llvm/test/CodeGen/AArch64/regress-w29-reserved-with-fp.ll
index cc42b0c..c3167e4 100644
--- a/llvm/test/CodeGen/AArch64/regress-w29-reserved-with-fp.ll
+++ b/llvm/test/CodeGen/AArch64/regress-w29-reserved-with-fp.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-none-linux-gnu -disable-fp-elim < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-fp-elim < %s | FileCheck %s
 @var = global i32 0
 
 declare void @bar()
diff --git a/llvm/test/CodeGen/AArch64/setcc-takes-i32.ll b/llvm/test/CodeGen/AArch64/setcc-takes-i32.ll
index f06c8ec..ec86159 100644
--- a/llvm/test/CodeGen/AArch64/setcc-takes-i32.ll
+++ b/llvm/test/CodeGen/AArch64/setcc-takes-i32.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -mtriple=arm64-none-linux-gnu -o - %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -o - %s | FileCheck %s
 
 ; Most important point here is that the promotion of the i1 works
 ; correctly. Previously LLVM thought that i64 was the appropriate SetCC output,
diff --git a/llvm/test/CodeGen/AArch64/sibling-call.ll b/llvm/test/CodeGen/AArch64/sibling-call.ll
index 8524571..34e3bb4 100644
--- a/llvm/test/CodeGen/AArch64/sibling-call.ll
+++ b/llvm/test/CodeGen/AArch64/sibling-call.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -arm64-load-store-opt=0 | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -aarch64-load-store-opt=0 | FileCheck %s
 
 declare void @callee_stack0()
 declare void @callee_stack8([8 x i32], i64)
diff --git a/llvm/test/CodeGen/AArch64/sincos-expansion.ll b/llvm/test/CodeGen/AArch64/sincos-expansion.ll
index 5ba1d8d..c3a172d 100644
--- a/llvm/test/CodeGen/AArch64/sincos-expansion.ll
+++ b/llvm/test/CodeGen/AArch64/sincos-expansion.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=arm64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
 
 define float @test_sincos_f32(float %f) {
   %sin = call float @sinf(float %f) readnone
diff --git a/llvm/test/CodeGen/AArch64/sincospow-vector-expansion.ll b/llvm/test/CodeGen/AArch64/sincospow-vector-expansion.ll
index 38c8bb2..22f33a8 100644
--- a/llvm/test/CodeGen/AArch64/sincospow-vector-expansion.ll
+++ b/llvm/test/CodeGen/AArch64/sincospow-vector-expansion.ll
@@ -1,4 +1,4 @@
-; RUN: llc -o - %s -verify-machineinstrs -mtriple=arm64-linux-gnu -mattr=+neon | FileCheck %s
+; RUN: llc -o - %s -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+neon | FileCheck %s
 
 
 define <2 x float> @test_cos_v2f64(<2 x double> %v1) {
diff --git a/llvm/test/CodeGen/AArch64/tail-call.ll b/llvm/test/CodeGen/AArch64/tail-call.ll
index b3841fa..8aab842 100644
--- a/llvm/test/CodeGen/AArch64/tail-call.ll
+++ b/llvm/test/CodeGen/AArch64/tail-call.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=arm64-none-linux-gnu -tailcallopt | FileCheck --check-prefix=CHECK-ARM64 %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s
 
 declare fastcc void @callee_stack0()
 declare fastcc void @callee_stack8([8 x i32], i64)
@@ -8,91 +8,59 @@
 ; CHECK-LABEL: caller_to0_from0:
 ; CHECK-NEXT: // BB
 
-; CHECK-ARM64-LABEL: caller_to0_from0:
-; CHECK-ARM64-NEXT: // BB
-
   tail call fastcc void @callee_stack0()
   ret void
 
 ; CHECK-NEXT: b callee_stack0
-
-; CHECK-ARM64-NEXT: b callee_stack0
 }
 
 define fastcc void @caller_to0_from8([8 x i32], i64) {
 ; CHECK-LABEL: caller_to0_from8:
 
-; CHECK-ARM64-LABEL: caller_to0_from8:
-
   tail call fastcc void @callee_stack0()
   ret void
 
 ; CHECK: add sp, sp, #16
 ; CHECK-NEXT: b callee_stack0
-
-; CHECK-ARM64: add sp, sp, #16
-; CHECK-ARM64-NEXT: b callee_stack0
 }
 
 define fastcc void @caller_to8_from0() {
 ; CHECK-LABEL: caller_to8_from0:
 ; CHECK: sub sp, sp, #32
 
-; CHECK-ARM64-LABEL: caller_to8_from0:
-; CHECK-ARM64: sub sp, sp, #32
-
 ; Key point is that the "42" should go #16 below incoming stack
 ; pointer (we didn't have arg space to reuse).
   tail call fastcc void @callee_stack8([8 x i32] undef, i64 42)
   ret void
 
-; CHECK: str {{x[0-9]+}}, [sp, #16]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK: str {{x[0-9]+}}, [sp, #16]!
 ; CHECK-NEXT: b callee_stack8
-
-; CHECK-ARM64: str {{x[0-9]+}}, [sp, #16]!
-; CHECK-ARM64-NEXT: b callee_stack8
 }
 
 define fastcc void @caller_to8_from8([8 x i32], i64 %a) {
 ; CHECK-LABEL: caller_to8_from8:
 ; CHECK: sub sp, sp, #16
 
-; CHECK-ARM64-LABEL: caller_to8_from8:
-; CHECK-ARM64: sub sp, sp, #16
-
 ; Key point is that the "%a" should go where at SP on entry.
   tail call fastcc void @callee_stack8([8 x i32] undef, i64 42)
   ret void
 
-; CHECK: str {{x[0-9]+}}, [sp, #16]
-; CHECK-NEXT: add sp, sp, #16
+; CHECK: str {{x[0-9]+}}, [sp, #16]!
 ; CHECK-NEXT: b callee_stack8
-
-; CHECK-ARM64: str {{x[0-9]+}}, [sp, #16]!
-; CHECK-ARM64-NEXT: b callee_stack8
 }
 
 define fastcc void @caller_to16_from8([8 x i32], i64 %a) {
 ; CHECK-LABEL: caller_to16_from8:
 ; CHECK: sub sp, sp, #16
 
-; CHECK-ARM64-LABEL: caller_to16_from8:
-; CHECK-ARM64: sub sp, sp, #16
-
 ; Important point is that the call reuses the "dead" argument space
 ; above %a on the stack. If it tries to go below incoming-SP then the
 ; callee will not deallocate the space, even in fastcc.
   tail call fastcc void @callee_stack16([8 x i32] undef, i64 42, i64 2)
 
-; CHECK: str {{x[0-9]+}}, [sp, #24]
-; CHECK: str {{x[0-9]+}}, [sp, #16]
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
 ; CHECK-NEXT: add sp, sp, #16
 ; CHECK-NEXT: b callee_stack16
-
-; CHECK-ARM64: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
-; CHECK-ARM64-NEXT: add sp, sp, #16
-; CHECK-ARM64-NEXT: b callee_stack16
   ret void
 }
 
@@ -101,19 +69,12 @@
 ; CHECK-LABEL: caller_to8_from24:
 ; CHECK: sub sp, sp, #16
 
-; CHECK-ARM64-LABEL: caller_to8_from24:
-; CHECK-ARM64: sub sp, sp, #16
-
 ; Key point is that the "%a" should go where at #16 above SP on entry.
   tail call fastcc void @callee_stack8([8 x i32] undef, i64 42)
   ret void
 
-; CHECK: str {{x[0-9]+}}, [sp, #32]
-; CHECK-NEXT: add sp, sp, #32
+; CHECK: str {{x[0-9]+}}, [sp, #32]!
 ; CHECK-NEXT: b callee_stack8
-
-; CHECK-ARM64: str {{x[0-9]+}}, [sp, #32]!
-; CHECK-ARM64-NEXT: b callee_stack8
 }
 
 
@@ -121,24 +82,13 @@
 ; CHECK-LABEL: caller_to16_from16:
 ; CHECK: sub sp, sp, #16
 
-; CHECK-ARM64-LABEL: caller_to16_from16:
-; CHECK-ARM64: sub sp, sp, #16
-
 ; Here we want to make sure that both loads happen before the stores:
 ; otherwise either %a or %b will be wrongly clobbered.
   tail call fastcc void @callee_stack16([8 x i32] undef, i64 %b, i64 %a)
   ret void
 
-; CHECK: ldr x0,
-; CHECK: ldr x1,
-; CHECK: str x1,
-; CHECK: str x0,
-
+; CHECK: ldp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
+; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
 ; CHECK-NEXT: add sp, sp, #16
 ; CHECK-NEXT: b callee_stack16
-
-; CHECK-ARM64: ldp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
-; CHECK-ARM64: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16]
-; CHECK-ARM64-NEXT: add sp, sp, #16
-; CHECK-ARM64-NEXT: b callee_stack16
 }
diff --git a/llvm/test/CodeGen/AArch64/zero-reg.ll b/llvm/test/CodeGen/AArch64/zero-reg.ll
index 44072c6..bc112ab 100644
--- a/llvm/test/CodeGen/AArch64/zero-reg.ll
+++ b/llvm/test/CodeGen/AArch64/zero-reg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -o - %s -mtriple=arm64-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs -o - %s -mtriple=aarch64-linux-gnu | FileCheck %s
 
 @var32 = global i32 0
 @var64 = global i64 0
diff --git a/llvm/test/CodeGen/ARM64/aarch64-neon-aba-abd.ll b/llvm/test/CodeGen/ARM64/aarch64-neon-aba-abd.ll
deleted file mode 100644
index 9d321b2..0000000
--- a/llvm/test/CodeGen/ARM64/aarch64-neon-aba-abd.ll
+++ /dev/null
@@ -1,236 +0,0 @@
-; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8>, <8 x i8>)
-declare <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_uabd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_uabd_v8i8:
-  %abd = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: uabd v0.8b, v0.8b, v1.8b
-  ret <8 x i8> %abd
-}
-
-define <8 x i8> @test_uaba_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_uaba_v8i8:
-  %abd = call <8 x i8> @llvm.arm64.neon.uabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-  %aba = add <8 x i8> %lhs, %abd
-; CHECK: uaba v0.8b, v0.8b, v1.8b
-  ret <8 x i8> %aba
-}
-
-define <8 x i8> @test_sabd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_sabd_v8i8:
-  %abd = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: sabd v0.8b, v0.8b, v1.8b
-  ret <8 x i8> %abd
-}
-
-define <8 x i8> @test_saba_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; CHECK: test_saba_v8i8:
-  %abd = call <8 x i8> @llvm.arm64.neon.sabd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-  %aba = add <8 x i8> %lhs, %abd
-; CHECK: saba v0.8b, v0.8b, v1.8b
-  ret <8 x i8> %aba
-}
-
-declare <16 x i8> @llvm.arm64.neon.uabd.v16i8(<16 x i8>, <16 x i8>)
-declare <16 x i8> @llvm.arm64.neon.sabd.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_uabd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_uabd_v16i8:
-  %abd = call <16 x i8> @llvm.arm64.neon.uabd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: uabd v0.16b, v0.16b, v1.16b
-  ret <16 x i8> %abd
-}
-
-define <16 x i8> @test_uaba_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_uaba_v16i8:
-  %abd = call <16 x i8> @llvm.arm64.neon.uabd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-  %aba = add <16 x i8> %lhs, %abd
-; CHECK: uaba v0.16b, v0.16b, v1.16b
-  ret <16 x i8> %aba
-}
-
-define <16 x i8> @test_sabd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_sabd_v16i8:
-  %abd = call <16 x i8> @llvm.arm64.neon.sabd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: sabd v0.16b, v0.16b, v1.16b
-  ret <16 x i8> %abd
-}
-
-define <16 x i8> @test_saba_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_saba_v16i8:
-  %abd = call <16 x i8> @llvm.arm64.neon.sabd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-  %aba = add <16 x i8> %lhs, %abd
-; CHECK: saba v0.16b, v0.16b, v1.16b
-  ret <16 x i8> %aba
-}
-
-declare <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16>, <4 x i16>)
-declare <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_uabd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_uabd_v4i16:
-  %abd = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: uabd v0.4h, v0.4h, v1.4h
-  ret <4 x i16> %abd
-}
-
-define <4 x i16> @test_uaba_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_uaba_v4i16:
-  %abd = call <4 x i16> @llvm.arm64.neon.uabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-  %aba = add <4 x i16> %lhs, %abd
-; CHECK: uaba v0.4h, v0.4h, v1.4h
-  ret <4 x i16> %aba
-}
-
-define <4 x i16> @test_sabd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_sabd_v4i16:
-  %abd = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: sabd v0.4h, v0.4h, v1.4h
-  ret <4 x i16> %abd
-}
-
-define <4 x i16> @test_saba_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_saba_v4i16:
-  %abd = call <4 x i16> @llvm.arm64.neon.sabd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-  %aba = add <4 x i16> %lhs, %abd
-; CHECK: saba v0.4h, v0.4h, v1.4h
-  ret <4 x i16> %aba
-}
-
-declare <8 x i16> @llvm.arm64.neon.uabd.v8i16(<8 x i16>, <8 x i16>)
-declare <8 x i16> @llvm.arm64.neon.sabd.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_uabd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_uabd_v8i16:
-  %abd = call <8 x i16> @llvm.arm64.neon.uabd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: uabd v0.8h, v0.8h, v1.8h
-  ret <8 x i16> %abd
-}
-
-define <8 x i16> @test_uaba_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_uaba_v8i16:
-  %abd = call <8 x i16> @llvm.arm64.neon.uabd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-  %aba = add <8 x i16> %lhs, %abd
-; CHECK: uaba v0.8h, v0.8h, v1.8h
-  ret <8 x i16> %aba
-}
-
-define <8 x i16> @test_sabd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_sabd_v8i16:
-  %abd = call <8 x i16> @llvm.arm64.neon.sabd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: sabd v0.8h, v0.8h, v1.8h
-  ret <8 x i16> %abd
-}
-
-define <8 x i16> @test_saba_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_saba_v8i16:
-  %abd = call <8 x i16> @llvm.arm64.neon.sabd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-  %aba = add <8 x i16> %lhs, %abd
-; CHECK: saba v0.8h, v0.8h, v1.8h
-  ret <8 x i16> %aba
-}
-
-declare <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32>, <2 x i32>)
-declare <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_uabd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_uabd_v2i32:
-  %abd = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: uabd v0.2s, v0.2s, v1.2s
-  ret <2 x i32> %abd
-}
-
-define <2 x i32> @test_uaba_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_uaba_v2i32:
-  %abd = call <2 x i32> @llvm.arm64.neon.uabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-  %aba = add <2 x i32> %lhs, %abd
-; CHECK: uaba v0.2s, v0.2s, v1.2s
-  ret <2 x i32> %aba
-}
-
-define <2 x i32> @test_sabd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_sabd_v2i32:
-  %abd = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: sabd v0.2s, v0.2s, v1.2s
-  ret <2 x i32> %abd
-}
-
-define <2 x i32> @test_sabd_v2i32_const() {
-; CHECK: test_sabd_v2i32_const:
-; CHECK: movi     d1, #0x00ffffffff0000
-; CHECK-NEXT: sabd v0.2s, v0.2s, v1.2s
-  %1 = tail call <2 x i32> @llvm.arm64.neon.sabd.v2i32(
-    <2 x i32> <i32 -2147483648, i32 2147450880>,
-    <2 x i32> <i32 -65536, i32 65535>)
-  ret <2 x i32> %1
-}
-
-define <2 x i32> @test_saba_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_saba_v2i32:
-  %abd = call <2 x i32> @llvm.arm64.neon.sabd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-  %aba = add <2 x i32> %lhs, %abd
-; CHECK: saba v0.2s, v0.2s, v1.2s
-  ret <2 x i32> %aba
-}
-
-declare <4 x i32> @llvm.arm64.neon.uabd.v4i32(<4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.arm64.neon.sabd.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_uabd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_uabd_v4i32:
-  %abd = call <4 x i32> @llvm.arm64.neon.uabd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: uabd v0.4s, v0.4s, v1.4s
-  ret <4 x i32> %abd
-}
-
-define <4 x i32> @test_uaba_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_uaba_v4i32:
-  %abd = call <4 x i32> @llvm.arm64.neon.uabd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-  %aba = add <4 x i32> %lhs, %abd
-; CHECK: uaba v0.4s, v0.4s, v1.4s
-  ret <4 x i32> %aba
-}
-
-define <4 x i32> @test_sabd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_sabd_v4i32:
-  %abd = call <4 x i32> @llvm.arm64.neon.sabd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: sabd v0.4s, v0.4s, v1.4s
-  ret <4 x i32> %abd
-}
-
-define <4 x i32> @test_saba_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_saba_v4i32:
-  %abd = call <4 x i32> @llvm.arm64.neon.sabd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-  %aba = add <4 x i32> %lhs, %abd
-; CHECK: saba v0.4s, v0.4s, v1.4s
-  ret <4 x i32> %aba
-}
-
-declare <2 x float> @llvm.arm64.neon.fabd.v2f32(<2 x float>, <2 x float>)
-
-define <2 x float> @test_fabd_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
-; CHECK: test_fabd_v2f32:
-  %abd = call <2 x float> @llvm.arm64.neon.fabd.v2f32(<2 x float> %lhs, <2 x float> %rhs)
-; CHECK: fabd v0.2s, v0.2s, v1.2s
-  ret <2 x float> %abd
-}
-
-declare <4 x float> @llvm.arm64.neon.fabd.v4f32(<4 x float>, <4 x float>)
-
-define <4 x float> @test_fabd_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
-; CHECK: test_fabd_v4f32:
-  %abd = call <4 x float> @llvm.arm64.neon.fabd.v4f32(<4 x float> %lhs, <4 x float> %rhs)
-; CHECK: fabd v0.4s, v0.4s, v1.4s
-  ret <4 x float> %abd
-}
-
-declare <2 x double> @llvm.arm64.neon.fabd.v2f64(<2 x double>, <2 x double>)
-
-define <2 x double> @test_fabd_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
-; CHECK: test_fabd_v2f64:
-  %abd = call <2 x double> @llvm.arm64.neon.fabd.v2f64(<2 x double> %lhs, <2 x double> %rhs)
-; CHECK: fabd v0.2d, v0.2d, v1.2d
-  ret <2 x double> %abd
-}
diff --git a/llvm/test/CodeGen/ARM64/aarch64-neon-across.ll b/llvm/test/CodeGen/ARM64/aarch64-neon-across.ll
deleted file mode 100644
index 5a3538b..0000000
--- a/llvm/test/CodeGen/ARM64/aarch64-neon-across.ll
+++ /dev/null
@@ -1,460 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-declare float @llvm.arm64.neon.fminnmv.f32.v4f32(<4 x float>)
-
-declare float @llvm.arm64.neon.fmaxnmv.f32.v4f32(<4 x float>)
-
-declare float @llvm.arm64.neon.fminv.f32.v4f32(<4 x float>)
-
-declare float @llvm.arm64.neon.fmaxv.f32.v4f32(<4 x float>)
-
-declare i32 @llvm.arm64.neon.saddv.i32.v4i32(<4 x i32>)
-
-declare i32 @llvm.arm64.neon.saddv.i32.v8i16(<8 x i16>)
-
-declare i32 @llvm.arm64.neon.saddv.i32.v16i8(<16 x i8>)
-
-declare i32 @llvm.arm64.neon.saddv.i32.v4i16(<4 x i16>)
-
-declare i32 @llvm.arm64.neon.saddv.i32.v8i8(<8 x i8>)
-
-declare i32 @llvm.arm64.neon.uminv.i32.v4i32(<4 x i32>)
-
-declare i32 @llvm.arm64.neon.uminv.i32.v8i16(<8 x i16>)
-
-declare i32 @llvm.arm64.neon.uminv.i32.v16i8(<16 x i8>)
-
-declare i32 @llvm.arm64.neon.sminv.i32.v4i32(<4 x i32>)
-
-declare i32 @llvm.arm64.neon.sminv.i32.v8i16(<8 x i16>)
-
-declare i32 @llvm.arm64.neon.sminv.i32.v16i8(<16 x i8>)
-
-declare i32 @llvm.arm64.neon.uminv.i32.v4i16(<4 x i16>)
-
-declare i32 @llvm.arm64.neon.uminv.i32.v8i8(<8 x i8>)
-
-declare i32 @llvm.arm64.neon.sminv.i32.v4i16(<4 x i16>)
-
-declare i32 @llvm.arm64.neon.sminv.i32.v8i8(<8 x i8>)
-
-declare i32 @llvm.arm64.neon.umaxv.i32.v4i32(<4 x i32>)
-
-declare i32 @llvm.arm64.neon.umaxv.i32.v8i16(<8 x i16>)
-
-declare i32 @llvm.arm64.neon.umaxv.i32.v16i8(<16 x i8>)
-
-declare i32 @llvm.arm64.neon.smaxv.i32.v4i32(<4 x i32>)
-
-declare i32 @llvm.arm64.neon.smaxv.i32.v8i16(<8 x i16>)
-
-declare i32 @llvm.arm64.neon.smaxv.i32.v16i8(<16 x i8>)
-
-declare i32 @llvm.arm64.neon.umaxv.i32.v4i16(<4 x i16>)
-
-declare i32 @llvm.arm64.neon.umaxv.i32.v8i8(<8 x i8>)
-
-declare i32 @llvm.arm64.neon.smaxv.i32.v4i16(<4 x i16>)
-
-declare i32 @llvm.arm64.neon.smaxv.i32.v8i8(<8 x i8>)
-
-declare i64 @llvm.arm64.neon.uaddlv.i64.v4i32(<4 x i32>)
-
-declare i32 @llvm.arm64.neon.uaddlv.i32.v8i16(<8 x i16>)
-
-declare i32 @llvm.arm64.neon.uaddlv.i32.v16i8(<16 x i8>)
-
-declare i64 @llvm.arm64.neon.saddlv.i64.v4i32(<4 x i32>)
-
-declare i32 @llvm.arm64.neon.saddlv.i32.v8i16(<8 x i16>)
-
-declare i32 @llvm.arm64.neon.saddlv.i32.v16i8(<16 x i8>)
-
-declare i32 @llvm.arm64.neon.uaddlv.i32.v4i16(<4 x i16>)
-
-declare i32 @llvm.arm64.neon.uaddlv.i32.v8i8(<8 x i8>)
-
-declare i32 @llvm.arm64.neon.saddlv.i32.v4i16(<4 x i16>)
-
-declare i32 @llvm.arm64.neon.saddlv.i32.v8i8(<8 x i8>)
-
-define i16 @test_vaddlv_s8(<8 x i8> %a) {
-; CHECK: test_vaddlv_s8:
-; CHECK: saddlv h{{[0-9]+}}, {{v[0-9]+}}.8b
-entry:
-  %saddlvv.i = tail call i32 @llvm.arm64.neon.saddlv.i32.v8i8(<8 x i8> %a)
-  %0 = trunc i32 %saddlvv.i to i16
-  ret i16 %0
-}
-
-define i32 @test_vaddlv_s16(<4 x i16> %a) {
-; CHECK: test_vaddlv_s16:
-; CHECK: saddlv s{{[0-9]+}}, {{v[0-9]+}}.4h
-entry:
-  %saddlvv.i = tail call i32 @llvm.arm64.neon.saddlv.i32.v4i16(<4 x i16> %a)
-  ret i32 %saddlvv.i
-}
-
-define i16 @test_vaddlv_u8(<8 x i8> %a) {
-; CHECK: test_vaddlv_u8:
-; CHECK: uaddlv h{{[0-9]+}}, {{v[0-9]+}}.8b
-entry:
-  %uaddlvv.i = tail call i32 @llvm.arm64.neon.uaddlv.i32.v8i8(<8 x i8> %a)
-  %0 = trunc i32 %uaddlvv.i to i16
-  ret i16 %0
-}
-
-define i32 @test_vaddlv_u16(<4 x i16> %a) {
-; CHECK: test_vaddlv_u16:
-; CHECK: uaddlv s{{[0-9]+}}, {{v[0-9]+}}.4h
-entry:
-  %uaddlvv.i = tail call i32 @llvm.arm64.neon.uaddlv.i32.v4i16(<4 x i16> %a)
-  ret i32 %uaddlvv.i
-}
-
-define i16 @test_vaddlvq_s8(<16 x i8> %a) {
-; CHECK: test_vaddlvq_s8:
-; CHECK: saddlv h{{[0-9]+}}, {{v[0-9]+}}.16b
-entry:
-  %saddlvv.i = tail call i32 @llvm.arm64.neon.saddlv.i32.v16i8(<16 x i8> %a)
-  %0 = trunc i32 %saddlvv.i to i16
-  ret i16 %0
-}
-
-define i32 @test_vaddlvq_s16(<8 x i16> %a) {
-; CHECK: test_vaddlvq_s16:
-; CHECK: saddlv s{{[0-9]+}}, {{v[0-9]+}}.8h
-entry:
-  %saddlvv.i = tail call i32 @llvm.arm64.neon.saddlv.i32.v8i16(<8 x i16> %a)
-  ret i32 %saddlvv.i
-}
-
-define i64 @test_vaddlvq_s32(<4 x i32> %a) {
-; CHECK: test_vaddlvq_s32:
-; CHECK: saddlv d{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
-  %saddlvv.i = tail call i64 @llvm.arm64.neon.saddlv.i64.v4i32(<4 x i32> %a)
-  ret i64 %saddlvv.i
-}
-
-define i16 @test_vaddlvq_u8(<16 x i8> %a) {
-; CHECK: test_vaddlvq_u8:
-; CHECK: uaddlv h{{[0-9]+}}, {{v[0-9]+}}.16b
-entry:
-  %uaddlvv.i = tail call i32 @llvm.arm64.neon.uaddlv.i32.v16i8(<16 x i8> %a)
-  %0 = trunc i32 %uaddlvv.i to i16
-  ret i16 %0
-}
-
-define i32 @test_vaddlvq_u16(<8 x i16> %a) {
-; CHECK: test_vaddlvq_u16:
-; CHECK: uaddlv s{{[0-9]+}}, {{v[0-9]+}}.8h
-entry:
-  %uaddlvv.i = tail call i32 @llvm.arm64.neon.uaddlv.i32.v8i16(<8 x i16> %a)
-  ret i32 %uaddlvv.i
-}
-
-define i64 @test_vaddlvq_u32(<4 x i32> %a) {
-; CHECK: test_vaddlvq_u32:
-; CHECK: uaddlv d{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
-  %uaddlvv.i = tail call i64 @llvm.arm64.neon.uaddlv.i64.v4i32(<4 x i32> %a)
-  ret i64 %uaddlvv.i
-}
-
-define i8 @test_vmaxv_s8(<8 x i8> %a) {
-; CHECK: test_vmaxv_s8:
-; CHECK: smaxv b{{[0-9]+}}, {{v[0-9]+}}.8b
-entry:
-  %smaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v8i8(<8 x i8> %a)
-  %0 = trunc i32 %smaxv.i to i8
-  ret i8 %0
-}
-
-define i16 @test_vmaxv_s16(<4 x i16> %a) {
-; CHECK: test_vmaxv_s16:
-; CHECK: smaxv h{{[0-9]+}}, {{v[0-9]+}}.4h
-entry:
-  %smaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v4i16(<4 x i16> %a)
-  %0 = trunc i32 %smaxv.i to i16
-  ret i16 %0
-}
-
-define i8 @test_vmaxv_u8(<8 x i8> %a) {
-; CHECK: test_vmaxv_u8:
-; CHECK: umaxv b{{[0-9]+}}, {{v[0-9]+}}.8b
-entry:
-  %umaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v8i8(<8 x i8> %a)
-  %0 = trunc i32 %umaxv.i to i8
-  ret i8 %0
-}
-
-define i16 @test_vmaxv_u16(<4 x i16> %a) {
-; CHECK: test_vmaxv_u16:
-; CHECK: umaxv h{{[0-9]+}}, {{v[0-9]+}}.4h
-entry:
-  %umaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v4i16(<4 x i16> %a)
-  %0 = trunc i32 %umaxv.i to i16
-  ret i16 %0
-}
-
-define i8 @test_vmaxvq_s8(<16 x i8> %a) {
-; CHECK: test_vmaxvq_s8:
-; CHECK: smaxv b{{[0-9]+}}, {{v[0-9]+}}.16b
-entry:
-  %smaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v16i8(<16 x i8> %a)
-  %0 = trunc i32 %smaxv.i to i8
-  ret i8 %0
-}
-
-define i16 @test_vmaxvq_s16(<8 x i16> %a) {
-; CHECK: test_vmaxvq_s16:
-; CHECK: smaxv h{{[0-9]+}}, {{v[0-9]+}}.8h
-entry:
-  %smaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v8i16(<8 x i16> %a)
-  %0 = trunc i32 %smaxv.i to i16
-  ret i16 %0
-}
-
-define i32 @test_vmaxvq_s32(<4 x i32> %a) {
-; CHECK: test_vmaxvq_s32:
-; CHECK: smaxv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
-  %smaxv.i = tail call i32 @llvm.arm64.neon.smaxv.i32.v4i32(<4 x i32> %a)
-  ret i32 %smaxv.i
-}
-
-define i8 @test_vmaxvq_u8(<16 x i8> %a) {
-; CHECK: test_vmaxvq_u8:
-; CHECK: umaxv b{{[0-9]+}}, {{v[0-9]+}}.16b
-entry:
-  %umaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v16i8(<16 x i8> %a)
-  %0 = trunc i32 %umaxv.i to i8
-  ret i8 %0
-}
-
-define i16 @test_vmaxvq_u16(<8 x i16> %a) {
-; CHECK: test_vmaxvq_u16:
-; CHECK: umaxv h{{[0-9]+}}, {{v[0-9]+}}.8h
-entry:
-  %umaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v8i16(<8 x i16> %a)
-  %0 = trunc i32 %umaxv.i to i16
-  ret i16 %0
-}
-
-define i32 @test_vmaxvq_u32(<4 x i32> %a) {
-; CHECK: test_vmaxvq_u32:
-; CHECK: umaxv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
-  %umaxv.i = tail call i32 @llvm.arm64.neon.umaxv.i32.v4i32(<4 x i32> %a)
-  ret i32 %umaxv.i
-}
-
-define i8 @test_vminv_s8(<8 x i8> %a) {
-; CHECK: test_vminv_s8:
-; CHECK: sminv b{{[0-9]+}}, {{v[0-9]+}}.8b
-entry:
-  %sminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v8i8(<8 x i8> %a)
-  %0 = trunc i32 %sminv.i to i8
-  ret i8 %0
-}
-
-define i16 @test_vminv_s16(<4 x i16> %a) {
-; CHECK: test_vminv_s16:
-; CHECK: sminv h{{[0-9]+}}, {{v[0-9]+}}.4h
-entry:
-  %sminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v4i16(<4 x i16> %a)
-  %0 = trunc i32 %sminv.i to i16
-  ret i16 %0
-}
-
-define i8 @test_vminv_u8(<8 x i8> %a) {
-; CHECK: test_vminv_u8:
-; CHECK: uminv b{{[0-9]+}}, {{v[0-9]+}}.8b
-entry:
-  %uminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v8i8(<8 x i8> %a)
-  %0 = trunc i32 %uminv.i to i8
-  ret i8 %0
-}
-
-define i16 @test_vminv_u16(<4 x i16> %a) {
-; CHECK: test_vminv_u16:
-; CHECK: uminv h{{[0-9]+}}, {{v[0-9]+}}.4h
-entry:
-  %uminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v4i16(<4 x i16> %a)
-  %0 = trunc i32 %uminv.i to i16
-  ret i16 %0
-}
-
-define i8 @test_vminvq_s8(<16 x i8> %a) {
-; CHECK: test_vminvq_s8:
-; CHECK: sminv b{{[0-9]+}}, {{v[0-9]+}}.16b
-entry:
-  %sminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v16i8(<16 x i8> %a)
-  %0 = trunc i32 %sminv.i to i8
-  ret i8 %0
-}
-
-define i16 @test_vminvq_s16(<8 x i16> %a) {
-; CHECK: test_vminvq_s16:
-; CHECK: sminv h{{[0-9]+}}, {{v[0-9]+}}.8h
-entry:
-  %sminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v8i16(<8 x i16> %a)
-  %0 = trunc i32 %sminv.i to i16
-  ret i16 %0
-}
-
-define i32 @test_vminvq_s32(<4 x i32> %a) {
-; CHECK: test_vminvq_s32:
-; CHECK: sminv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
-  %sminv.i = tail call i32 @llvm.arm64.neon.sminv.i32.v4i32(<4 x i32> %a)
-  ret i32 %sminv.i
-}
-
-define i8 @test_vminvq_u8(<16 x i8> %a) {
-; CHECK: test_vminvq_u8:
-; CHECK: uminv b{{[0-9]+}}, {{v[0-9]+}}.16b
-entry:
-  %uminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v16i8(<16 x i8> %a)
-  %0 = trunc i32 %uminv.i to i8
-  ret i8 %0
-}
-
-define i16 @test_vminvq_u16(<8 x i16> %a) {
-; CHECK: test_vminvq_u16:
-; CHECK: uminv h{{[0-9]+}}, {{v[0-9]+}}.8h
-entry:
-  %uminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v8i16(<8 x i16> %a)
-  %0 = trunc i32 %uminv.i to i16
-  ret i16 %0
-}
-
-define i32 @test_vminvq_u32(<4 x i32> %a) {
-; CHECK: test_vminvq_u32:
-; CHECK: uminv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
-  %uminv.i = tail call i32 @llvm.arm64.neon.uminv.i32.v4i32(<4 x i32> %a)
-  ret i32 %uminv.i
-}
-
-define i8 @test_vaddv_s8(<8 x i8> %a) {
-; CHECK: test_vaddv_s8:
-; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.8b
-entry:
-  %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v8i8(<8 x i8> %a)
-  %0 = trunc i32 %vaddv.i to i8
-  ret i8 %0
-}
-
-define i16 @test_vaddv_s16(<4 x i16> %a) {
-; CHECK: test_vaddv_s16:
-; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.4h
-entry:
-  %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v4i16(<4 x i16> %a)
-  %0 = trunc i32 %vaddv.i to i16
-  ret i16 %0
-}
-
-define i8 @test_vaddv_u8(<8 x i8> %a) {
-; CHECK: test_vaddv_u8:
-; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.8b
-entry:
-  %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v8i8(<8 x i8> %a)
-  %0 = trunc i32 %vaddv.i to i8
-  ret i8 %0
-}
-
-define i16 @test_vaddv_u16(<4 x i16> %a) {
-; CHECK: test_vaddv_u16:
-; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.4h
-entry:
-  %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v4i16(<4 x i16> %a)
-  %0 = trunc i32 %vaddv.i to i16
-  ret i16 %0
-}
-
-define i8 @test_vaddvq_s8(<16 x i8> %a) {
-; CHECK: test_vaddvq_s8:
-; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.16b
-entry:
-  %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v16i8(<16 x i8> %a)
-  %0 = trunc i32 %vaddv.i to i8
-  ret i8 %0
-}
-
-define i16 @test_vaddvq_s16(<8 x i16> %a) {
-; CHECK: test_vaddvq_s16:
-; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.8h
-entry:
-  %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v8i16(<8 x i16> %a)
-  %0 = trunc i32 %vaddv.i to i16
-  ret i16 %0
-}
-
-define i32 @test_vaddvq_s32(<4 x i32> %a) {
-; CHECK: test_vaddvq_s32:
-; CHECK: addv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
-  %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v4i32(<4 x i32> %a)
-  ret i32 %vaddv.i
-}
-
-define i8 @test_vaddvq_u8(<16 x i8> %a) {
-; CHECK: test_vaddvq_u8:
-; CHECK: addv b{{[0-9]+}}, {{v[0-9]+}}.16b
-entry:
-  %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v16i8(<16 x i8> %a)
-  %0 = trunc i32 %vaddv.i to i8
-  ret i8 %0
-}
-
-define i16 @test_vaddvq_u16(<8 x i16> %a) {
-; CHECK: test_vaddvq_u16:
-; CHECK: addv h{{[0-9]+}}, {{v[0-9]+}}.8h
-entry:
-  %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v8i16(<8 x i16> %a)
-  %0 = trunc i32 %vaddv.i to i16
-  ret i16 %0
-}
-
-define i32 @test_vaddvq_u32(<4 x i32> %a) {
-; CHECK: test_vaddvq_u32:
-; CHECK: addv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
-  %vaddv.i = tail call i32 @llvm.arm64.neon.saddv.i32.v4i32(<4 x i32> %a)
-  ret i32 %vaddv.i
-}
-
-define float @test_vmaxvq_f32(<4 x float> %a) {
-; CHECK: test_vmaxvq_f32:
-; CHECK: fmaxv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
-  %0 = call float @llvm.arm64.neon.fmaxv.f32.v4f32(<4 x float> %a)
-  ret float %0
-}
-
-define float @test_vminvq_f32(<4 x float> %a) {
-; CHECK: test_vminvq_f32:
-; CHECK: fminv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
-  %0 = call float @llvm.arm64.neon.fminv.f32.v4f32(<4 x float> %a)
-  ret float %0
-}
-
-define float @test_vmaxnmvq_f32(<4 x float> %a) {
-; CHECK: test_vmaxnmvq_f32:
-; CHECK: fmaxnmv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
-  %0 = call float @llvm.arm64.neon.fmaxnmv.f32.v4f32(<4 x float> %a)
-  ret float %0
-}
-
-define float @test_vminnmvq_f32(<4 x float> %a) {
-; CHECK: test_vminnmvq_f32:
-; CHECK: fminnmv s{{[0-9]+}}, {{v[0-9]+}}.4s
-entry:
-  %0 = call float @llvm.arm64.neon.fminnmv.f32.v4f32(<4 x float> %a)
-  ret float %0
-}
-
diff --git a/llvm/test/CodeGen/ARM64/aarch64-neon-add-pairwise.ll b/llvm/test/CodeGen/ARM64/aarch64-neon-add-pairwise.ll
deleted file mode 100644
index 9cd76ff..0000000
--- a/llvm/test/CodeGen/ARM64/aarch64-neon-add-pairwise.ll
+++ /dev/null
@@ -1,100 +0,0 @@
-; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
-
-declare <8 x i8> @llvm.arm64.neon.addp.v8i8(<8 x i8>, <8 x i8>)
-
-define <8 x i8> @test_addp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
-; Using registers other than v0, v1 is possible, but would be odd.
-; CHECK: test_addp_v8i8:
-  %tmp1 = call <8 x i8> @llvm.arm64.neon.addp.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
-; CHECK: addp v0.8b, v0.8b, v1.8b
-  ret <8 x i8> %tmp1
-}
-
-declare <16 x i8> @llvm.arm64.neon.addp.v16i8(<16 x i8>, <16 x i8>)
-
-define <16 x i8> @test_addp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
-; CHECK: test_addp_v16i8:
-  %tmp1 = call <16 x i8> @llvm.arm64.neon.addp.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
-; CHECK: addp v0.16b, v0.16b, v1.16b
-  ret <16 x i8> %tmp1
-}
-
-declare <4 x i16> @llvm.arm64.neon.addp.v4i16(<4 x i16>, <4 x i16>)
-
-define <4 x i16> @test_addp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
-; CHECK: test_addp_v4i16:
-  %tmp1 = call <4 x i16> @llvm.arm64.neon.addp.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
-; CHECK: addp v0.4h, v0.4h, v1.4h
-  ret <4 x i16> %tmp1
-}
-
-declare <8 x i16> @llvm.arm64.neon.addp.v8i16(<8 x i16>, <8 x i16>)
-
-define <8 x i16> @test_addp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
-; CHECK: test_addp_v8i16:
-  %tmp1 = call <8 x i16> @llvm.arm64.neon.addp.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
-; CHECK: addp v0.8h, v0.8h, v1.8h
-  ret <8 x i16> %tmp1
-}
-
-declare <2 x i32> @llvm.arm64.neon.addp.v2i32(<2 x i32>, <2 x i32>)
-
-define <2 x i32> @test_addp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
-; CHECK: test_addp_v2i32:
-  %tmp1 = call <2 x i32> @llvm.arm64.neon.addp.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
-; CHECK: addp v0.2s, v0.2s, v1.2s
-  ret <2 x i32> %tmp1
-}
-
-declare <4 x i32> @llvm.arm64.neon.addp.v4i32(<4 x i32>, <4 x i32>)
-
-define <4 x i32> @test_addp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
-; CHECK: test_addp_v4i32:
-  %tmp1 = call <4 x i32> @llvm.arm64.neon.addp.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
-; CHECK: addp v0.4s, v0.4s, v1.4s
-  ret <4 x i32> %tmp1
-}
-
-
-declare <2 x i64> @llvm.arm64.neon.addp.v2i64(<2 x i64>, <2 x i64>)
-
-define <2 x i64> @test_addp_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
-; CHECK: test_addp_v2i64:
-        %val = call <2 x i64> @llvm.arm64.neon.addp.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
-; CHECK: addp v0.2d, v0.2d, v1.2d
-        ret <2 x i64> %val
-}
-
-declare <2 x float> @llvm.arm64.neon.addp.v2f32(<2 x float>, <2 x float>)
-declare <4 x float> @llvm.arm64.neon.addp.v4f32(<4 x float>, <4 x float>)
-declare <2 x double> @llvm.arm64.neon.addp.v2f64(<2 x double>, <2 x double>)
-
-define <2 x float> @test_faddp_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
-; CHECK: test_faddp_v2f32:
-        %val = call <2 x float> @llvm.arm64.neon.addp.v2f32(<2 x float> %lhs, <2 x float> %rhs)
-; CHECK: faddp v0.2s, v0.2s, v1.2s
-        ret <2 x float> %val
-}
-
-define <4 x float> @test_faddp_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
-; CHECK: test_faddp_v4f32:
-        %val = call <4 x float> @llvm.arm64.neon.addp.v4f32(<4 x float> %lhs, <4 x float> %rhs)
-; CHECK: faddp v0.4s, v0.4s, v1.4s
-        ret <4 x float> %val
-}
-
-define <2 x double> @test_faddp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
-; CHECK: test_faddp_v2f64:
-        %val = call <2 x double> @llvm.arm64.neon.addp.v2f64(<2 x double> %lhs, <2 x double> %rhs)
-; CHECK: faddp v0.2d, v0.2d, v1.2d
-        ret <2 x double> %val
-}
-
-define i32 @test_vaddv.v2i32(<2 x i32> %a) {
-; CHECK-LABEL: test_vaddv.v2i32
-; CHECK: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  %1 = tail call i32 @llvm.arm64.neon.saddv.i32.v2i32(<2 x i32> %a)
-  ret i32 %1
-}
-
-declare i32 @llvm.arm64.neon.saddv.i32.v2i32(<2 x i32>)
diff --git a/llvm/test/CodeGen/ARM64/aarch64-neon-copyPhysReg-tuple.ll b/llvm/test/CodeGen/ARM64/aarch64-neon-copyPhysReg-tuple.ll
deleted file mode 100644
index f24392b..0000000
--- a/llvm/test/CodeGen/ARM64/aarch64-neon-copyPhysReg-tuple.ll
+++ /dev/null
@@ -1,48 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
-; arm64 has a separate copy due to intrinsics
-
-define <4 x i32> @copyTuple.QPair(i32* %a, i32* %b) {
-; CHECK-LABEL: copyTuple.QPair:
-; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: ld2 { {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x{{[0-9]+|sp}}]
-entry:
-  %vld = tail call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2lane.v4i32.p0i32(<4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 2, i32 2, i32 2, i32 2>, i64 1, i32* %a)
-  %extract = extractvalue { <4 x i32>, <4 x i32> } %vld, 0
-  %vld1 = tail call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2lane.v4i32.p0i32(<4 x i32> %extract, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i64 1, i32* %b)
-  %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld1, 0
-  ret <4 x i32> %vld1.fca.0.extract
-}
-
-define <4 x i32> @copyTuple.QTriple(i32* %a, i32* %b, <4 x i32> %c) {
-; CHECK-LABEL: copyTuple.QTriple:
-; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: ld3 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x{{[0-9]+|sp}}]
-entry:
-  %vld = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3lane.v4i32.p0i32(<4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, <4 x i32> %c, i64 1, i32* %a)
-  %extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld, 0
-  %vld1 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3lane.v4i32.p0i32(<4 x i32> %extract, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, i64 1, i32* %b)
-  %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld1, 0
-  ret <4 x i32> %vld1.fca.0.extract
-}
-
-define <4 x i32> @copyTuple.QQuad(i32* %a, i32* %b, <4 x i32> %c) {
-; CHECK-LABEL: copyTuple.QQuad:
-; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: mov v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
-; CHECK: ld4 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x{{[0-9]+|sp}}]
-entry:
-  %vld = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4lane.v4i32.p0i32(<4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, <4 x i32> %c, <4 x i32> %c, i64 1, i32* %a)
-  %extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld, 0
-  %vld1 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4lane.v4i32.p0i32(<4 x i32> %extract, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c, <4 x i32> %c, i64 1, i32* %b)
-  %vld1.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld1, 0
-  ret <4 x i32> %vld1.fca.0.extract
-}
-
-declare { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*)
-declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*)
-declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*)
diff --git a/llvm/test/CodeGen/ARM64/arith-saturating.ll b/llvm/test/CodeGen/ARM64/arith-saturating.ll
deleted file mode 100644
index 2d188ff..0000000
--- a/llvm/test/CodeGen/ARM64/arith-saturating.ll
+++ /dev/null
@@ -1,153 +0,0 @@
-; RUN: llc < %s -march=arm64 -mcpu=cyclone | FileCheck %s
-
-define i32 @qadds(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
-; CHECK-LABEL: qadds:
-; CHECK: sqadd s0, s0, s1
-  %vecext = extractelement <4 x i32> %b, i32 0
-  %vecext1 = extractelement <4 x i32> %c, i32 0
-  %vqadd.i = tail call i32 @llvm.arm64.neon.sqadd.i32(i32 %vecext, i32 %vecext1) nounwind
-  ret i32 %vqadd.i
-}
-
-define i64 @qaddd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
-; CHECK-LABEL: qaddd:
-; CHECK: sqadd d0, d0, d1
-  %vecext = extractelement <2 x i64> %b, i32 0
-  %vecext1 = extractelement <2 x i64> %c, i32 0
-  %vqadd.i = tail call i64 @llvm.arm64.neon.sqadd.i64(i64 %vecext, i64 %vecext1) nounwind
-  ret i64 %vqadd.i
-}
-
-define i32 @uqadds(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
-; CHECK-LABEL: uqadds:
-; CHECK: uqadd s0, s0, s1
-  %vecext = extractelement <4 x i32> %b, i32 0
-  %vecext1 = extractelement <4 x i32> %c, i32 0
-  %vqadd.i = tail call i32 @llvm.arm64.neon.uqadd.i32(i32 %vecext, i32 %vecext1) nounwind
-  ret i32 %vqadd.i
-}
-
-define i64 @uqaddd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
-; CHECK-LABEL: uqaddd:
-; CHECK: uqadd d0, d0, d1
-  %vecext = extractelement <2 x i64> %b, i32 0
-  %vecext1 = extractelement <2 x i64> %c, i32 0
-  %vqadd.i = tail call i64 @llvm.arm64.neon.uqadd.i64(i64 %vecext, i64 %vecext1) nounwind
-  ret i64 %vqadd.i
-}
-
-declare i64 @llvm.arm64.neon.uqadd.i64(i64, i64) nounwind readnone
-declare i32 @llvm.arm64.neon.uqadd.i32(i32, i32) nounwind readnone
-declare i64 @llvm.arm64.neon.sqadd.i64(i64, i64) nounwind readnone
-declare i32 @llvm.arm64.neon.sqadd.i32(i32, i32) nounwind readnone
-
-define i32 @qsubs(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
-; CHECK-LABEL: qsubs:
-; CHECK: sqsub s0, s0, s1
-  %vecext = extractelement <4 x i32> %b, i32 0
-  %vecext1 = extractelement <4 x i32> %c, i32 0
-  %vqsub.i = tail call i32 @llvm.arm64.neon.sqsub.i32(i32 %vecext, i32 %vecext1) nounwind
-  ret i32 %vqsub.i
-}
-
-define i64 @qsubd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
-; CHECK-LABEL: qsubd:
-; CHECK: sqsub d0, d0, d1
-  %vecext = extractelement <2 x i64> %b, i32 0
-  %vecext1 = extractelement <2 x i64> %c, i32 0
-  %vqsub.i = tail call i64 @llvm.arm64.neon.sqsub.i64(i64 %vecext, i64 %vecext1) nounwind
-  ret i64 %vqsub.i
-}
-
-define i32 @uqsubs(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
-; CHECK-LABEL: uqsubs:
-; CHECK: uqsub s0, s0, s1
-  %vecext = extractelement <4 x i32> %b, i32 0
-  %vecext1 = extractelement <4 x i32> %c, i32 0
-  %vqsub.i = tail call i32 @llvm.arm64.neon.uqsub.i32(i32 %vecext, i32 %vecext1) nounwind
-  ret i32 %vqsub.i
-}
-
-define i64 @uqsubd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
-; CHECK-LABEL: uqsubd:
-; CHECK: uqsub d0, d0, d1
-  %vecext = extractelement <2 x i64> %b, i32 0
-  %vecext1 = extractelement <2 x i64> %c, i32 0
-  %vqsub.i = tail call i64 @llvm.arm64.neon.uqsub.i64(i64 %vecext, i64 %vecext1) nounwind
-  ret i64 %vqsub.i
-}
-
-declare i64 @llvm.arm64.neon.uqsub.i64(i64, i64) nounwind readnone
-declare i32 @llvm.arm64.neon.uqsub.i32(i32, i32) nounwind readnone
-declare i64 @llvm.arm64.neon.sqsub.i64(i64, i64) nounwind readnone
-declare i32 @llvm.arm64.neon.sqsub.i32(i32, i32) nounwind readnone
-
-define i32 @qabss(<4 x i32> %b, <4 x i32> %c) nounwind readnone {
-; CHECK-LABEL: qabss:
-; CHECK: sqabs s0, s0
-; CHECK: ret
-  %vecext = extractelement <4 x i32> %b, i32 0
-  %vqabs.i = tail call i32 @llvm.arm64.neon.sqabs.i32(i32 %vecext) nounwind
-  ret i32 %vqabs.i
-}
-
-define i64 @qabsd(<2 x i64> %b, <2 x i64> %c) nounwind readnone {
-; CHECK-LABEL: qabsd:
-; CHECK: sqabs d0, d0
-; CHECK: ret
-  %vecext = extractelement <2 x i64> %b, i32 0
-  %vqabs.i = tail call i64 @llvm.arm64.neon.sqabs.i64(i64 %vecext) nounwind
-  ret i64 %vqabs.i
-}
-
-define i32 @qnegs(<4 x i32> %b, <4 x i32> %c) nounwind readnone {
-; CHECK-LABEL: qnegs:
-; CHECK: sqneg s0, s0
-; CHECK: ret
-  %vecext = extractelement <4 x i32> %b, i32 0
-  %vqneg.i = tail call i32 @llvm.arm64.neon.sqneg.i32(i32 %vecext) nounwind
-  ret i32 %vqneg.i
-}
-
-define i64 @qnegd(<2 x i64> %b, <2 x i64> %c) nounwind readnone {
-; CHECK-LABEL: qnegd:
-; CHECK: sqneg d0, d0
-; CHECK: ret
-  %vecext = extractelement <2 x i64> %b, i32 0
-  %vqneg.i = tail call i64 @llvm.arm64.neon.sqneg.i64(i64 %vecext) nounwind
-  ret i64 %vqneg.i
-}
-
-declare i64 @llvm.arm64.neon.sqneg.i64(i64) nounwind readnone
-declare i32 @llvm.arm64.neon.sqneg.i32(i32) nounwind readnone
-declare i64 @llvm.arm64.neon.sqabs.i64(i64) nounwind readnone
-declare i32 @llvm.arm64.neon.sqabs.i32(i32) nounwind readnone
-
-
-define i32 @vqmovund(<2 x i64> %b) nounwind readnone {
-; CHECK-LABEL: vqmovund:
-; CHECK: sqxtun s0, d0
-  %vecext = extractelement <2 x i64> %b, i32 0
-  %vqmovun.i = tail call i32 @llvm.arm64.neon.scalar.sqxtun.i32.i64(i64 %vecext) nounwind
-  ret i32 %vqmovun.i
-}
-
-define i32 @vqmovnd_s(<2 x i64> %b) nounwind readnone {
-; CHECK-LABEL: vqmovnd_s:
-; CHECK: sqxtn s0, d0
-  %vecext = extractelement <2 x i64> %b, i32 0
-  %vqmovn.i = tail call i32 @llvm.arm64.neon.scalar.sqxtn.i32.i64(i64 %vecext) nounwind
-  ret i32 %vqmovn.i
-}
-
-define i32 @vqmovnd_u(<2 x i64> %b) nounwind readnone {
-; CHECK-LABEL: vqmovnd_u:
-; CHECK: uqxtn s0, d0
-  %vecext = extractelement <2 x i64> %b, i32 0
-  %vqmovn.i = tail call i32 @llvm.arm64.neon.scalar.uqxtn.i32.i64(i64 %vecext) nounwind
-  ret i32 %vqmovn.i
-}
-
-declare i32 @llvm.arm64.neon.scalar.uqxtn.i32.i64(i64) nounwind readnone
-declare i32 @llvm.arm64.neon.scalar.sqxtn.i32.i64(i64) nounwind readnone
-declare i32 @llvm.arm64.neon.scalar.sqxtun.i32.i64(i64) nounwind readnone
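
These scalar saturating-arithmetic tests are not lost; the same coverage reappears under the new intrinsic prefix. A minimal sketch of the renamed form, assuming only the mechanical llvm.arm64.* to llvm.aarch64.* prefix swap (the function name here is illustrative):

; expected lowering: sqadd s0, s0, s1, as in the deleted qadds test
define i32 @qadds_renamed(<4 x i32> %b, <4 x i32> %c) nounwind readnone {
  %lhs = extractelement <4 x i32> %b, i32 0
  %rhs = extractelement <4 x i32> %c, i32 0
  %sum = tail call i32 @llvm.aarch64.neon.sqadd.i32(i32 %lhs, i32 %rhs)
  ret i32 %sum
}

declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32) nounwind readnone
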
diff --git a/llvm/test/CodeGen/ARM64/compact-unwind-unhandled-cfi.S b/llvm/test/CodeGen/ARM64/compact-unwind-unhandled-cfi.S
deleted file mode 100644
index 250732d..0000000
--- a/llvm/test/CodeGen/ARM64/compact-unwind-unhandled-cfi.S
+++ /dev/null
@@ -1,17 +0,0 @@
-; RUN: llvm-mc -triple arm64-apple-darwin -filetype=obj -o /dev/null %s
-
-        .text
-        .globl _foo
-        .cfi_startproc
-_foo:
-        stp x29, x30, [sp, #-16]!
- .cfi_adjust_cfa_offset 16
-
-        ldp x29, x30, [sp], #16
- .cfi_adjust_cfa_offset -16
-        .cfi_restore x29
-        .cfi_restore x30
-
-        ret
-
-        .cfi_endproc
diff --git a/llvm/test/CodeGen/ARM64/crc32.ll b/llvm/test/CodeGen/ARM64/crc32.ll
deleted file mode 100644
index 5d759dc..0000000
--- a/llvm/test/CodeGen/ARM64/crc32.ll
+++ /dev/null
@@ -1,71 +0,0 @@
-; RUN: llc -march=arm64 -mattr=+crc -o - %s | FileCheck %s
-
-define i32 @test_crc32b(i32 %cur, i8 %next) {
-; CHECK-LABEL: test_crc32b:
-; CHECK: crc32b w0, w0, w1
-  %bits = zext i8 %next to i32
-  %val = call i32 @llvm.arm64.crc32b(i32 %cur, i32 %bits)
-  ret i32 %val
-}
-
-define i32 @test_crc32h(i32 %cur, i16 %next) {
-; CHECK-LABEL: test_crc32h:
-; CHECK: crc32h w0, w0, w1
-  %bits = zext i16 %next to i32
-  %val = call i32 @llvm.arm64.crc32h(i32 %cur, i32 %bits)
-  ret i32 %val
-}
-
-define i32 @test_crc32w(i32 %cur, i32 %next) {
-; CHECK-LABEL: test_crc32w:
-; CHECK: crc32w w0, w0, w1
-  %val = call i32 @llvm.arm64.crc32w(i32 %cur, i32 %next)
-  ret i32 %val
-}
-
-define i32 @test_crc32x(i32 %cur, i64 %next) {
-; CHECK-LABEL: test_crc32x:
-; CHECK: crc32x w0, w0, x1
-  %val = call i32 @llvm.arm64.crc32x(i32 %cur, i64 %next)
-  ret i32 %val
-}
-
-define i32 @test_crc32cb(i32 %cur, i8 %next) {
-; CHECK-LABEL: test_crc32cb:
-; CHECK: crc32cb w0, w0, w1
-  %bits = zext i8 %next to i32
-  %val = call i32 @llvm.arm64.crc32cb(i32 %cur, i32 %bits)
-  ret i32 %val
-}
-
-define i32 @test_crc32ch(i32 %cur, i16 %next) {
-; CHECK-LABEL: test_crc32ch:
-; CHECK: crc32ch w0, w0, w1
-  %bits = zext i16 %next to i32
-  %val = call i32 @llvm.arm64.crc32ch(i32 %cur, i32 %bits)
-  ret i32 %val
-}
-
-define i32 @test_crc32cw(i32 %cur, i32 %next) {
-; CHECK-LABEL: test_crc32cw:
-; CHECK: crc32cw w0, w0, w1
-  %val = call i32 @llvm.arm64.crc32cw(i32 %cur, i32 %next)
-  ret i32 %val
-}
-
-define i32 @test_crc32cx(i32 %cur, i64 %next) {
-; CHECK-LABEL: test_crc32cx:
-; CHECK: crc32cx w0, w0, x1
-  %val = call i32 @llvm.arm64.crc32cx(i32 %cur, i64 %next)
-  ret i32 %val
-}
-
-declare i32 @llvm.arm64.crc32b(i32, i32)
-declare i32 @llvm.arm64.crc32h(i32, i32)
-declare i32 @llvm.arm64.crc32w(i32, i32)
-declare i32 @llvm.arm64.crc32x(i32, i64)
-
-declare i32 @llvm.arm64.crc32cb(i32, i32)
-declare i32 @llvm.arm64.crc32ch(i32, i32)
-declare i32 @llvm.arm64.crc32cw(i32, i32)
-declare i32 @llvm.arm64.crc32cx(i32, i64)
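
The CRC32 intrinsics follow the same prefix swap; a minimal sketch of the renamed byte variant, assuming llvm.aarch64.crc32b keeps the i32 accumulator / i32 data signature of the deleted declaration (the test name is illustrative):

define i32 @crc32b_renamed(i32 %cur, i8 %next) {
  ; the i8 input is widened before the intrinsic call, as above
  %bits = zext i8 %next to i32
  %val = call i32 @llvm.aarch64.crc32b(i32 %cur, i32 %bits)
  ret i32 %val
}

declare i32 @llvm.aarch64.crc32b(i32, i32)
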
diff --git a/llvm/test/CodeGen/ARM64/crypto.ll b/llvm/test/CodeGen/ARM64/crypto.ll
deleted file mode 100644
index 0020865..0000000
--- a/llvm/test/CodeGen/ARM64/crypto.ll
+++ /dev/null
@@ -1,135 +0,0 @@
-; RUN: llc -march=arm64 -mattr=crypto -arm64-neon-syntax=apple -o - %s | FileCheck %s
-
-declare <16 x i8> @llvm.arm64.crypto.aese(<16 x i8> %data, <16 x i8> %key)
-declare <16 x i8> @llvm.arm64.crypto.aesd(<16 x i8> %data, <16 x i8> %key)
-declare <16 x i8> @llvm.arm64.crypto.aesmc(<16 x i8> %data)
-declare <16 x i8> @llvm.arm64.crypto.aesimc(<16 x i8> %data)
-
-define <16 x i8> @test_aese(<16 x i8> %data, <16 x i8> %key) {
-; CHECK-LABEL: test_aese:
-; CHECK: aese.16b v0, v1
-  %res = call <16 x i8> @llvm.arm64.crypto.aese(<16 x i8> %data, <16 x i8> %key)
-  ret <16 x i8> %res
-}
-
-define <16 x i8> @test_aesd(<16 x i8> %data, <16 x i8> %key) {
-; CHECK-LABEL: test_aesd:
-; CHECK: aesd.16b v0, v1
-  %res = call <16 x i8> @llvm.arm64.crypto.aesd(<16 x i8> %data, <16 x i8> %key)
-  ret <16 x i8> %res
-}
-
-define <16 x i8> @test_aesmc(<16 x i8> %data) {
-; CHECK-LABEL: test_aesmc:
-; CHECK: aesmc.16b v0, v0
-  %res = call <16 x i8> @llvm.arm64.crypto.aesmc(<16 x i8> %data)
-  ret <16 x i8> %res
-}
-
-define <16 x i8> @test_aesimc(<16 x i8> %data) {
-; CHECK-LABEL: test_aesimc:
-; CHECK: aesimc.16b v0, v0
-  %res = call <16 x i8> @llvm.arm64.crypto.aesimc(<16 x i8> %data)
-  ret <16 x i8> %res
-}
-
-declare <4 x i32> @llvm.arm64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
-declare <4 x i32> @llvm.arm64.crypto.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
-declare <4 x i32> @llvm.arm64.crypto.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
-declare i32 @llvm.arm64.crypto.sha1h(i32 %hash_e)
-declare <4 x i32> @llvm.arm64.crypto.sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11)
-declare <4 x i32> @llvm.arm64.crypto.sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15)
-
-define <4 x i32> @test_sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
-; CHECK-LABEL: test_sha1c:
-; CHECK: fmov [[HASH_E:s[0-9]+]], w0
-; CHECK: sha1c.4s q0, [[HASH_E]], v1
-  %res = call <4 x i32> @llvm.arm64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
-  ret <4 x i32> %res
-}
-
-; <rdar://problem/14742333> Incomplete removal of unnecessary FMOV instructions in intrinsic SHA1
-define <4 x i32> @test_sha1c_in_a_row(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
-; CHECK-LABEL: test_sha1c_in_a_row:
-; CHECK: fmov [[HASH_E:s[0-9]+]], w0
-; CHECK: sha1c.4s q[[SHA1RES:[0-9]+]], [[HASH_E]], v1
-; CHECK-NOT: fmov
-; CHECK: sha1c.4s q0, s[[SHA1RES]], v1
-  %res = call <4 x i32> @llvm.arm64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
-  %extract = extractelement <4 x i32> %res, i32 0
-  %res2 = call <4 x i32> @llvm.arm64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %extract, <4 x i32> %wk)
-  ret <4 x i32> %res2
-}
-
-define <4 x i32> @test_sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
-; CHECK-LABEL: test_sha1p:
-; CHECK: fmov [[HASH_E:s[0-9]+]], w0
-; CHECK: sha1p.4s q0, [[HASH_E]], v1
-  %res = call <4 x i32> @llvm.arm64.crypto.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
-  ret <4 x i32> %res
-}
-
-define <4 x i32> @test_sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) {
-; CHECK-LABEL: test_sha1m:
-; CHECK: fmov [[HASH_E:s[0-9]+]], w0
-; CHECK: sha1m.4s q0, [[HASH_E]], v1
-  %res = call <4 x i32> @llvm.arm64.crypto.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk)
-  ret <4 x i32> %res
-}
-
-define i32 @test_sha1h(i32 %hash_e) {
-; CHECK-LABEL: test_sha1h:
-; CHECK: fmov [[HASH_E:s[0-9]+]], w0
-; CHECK: sha1h [[RES:s[0-9]+]], [[HASH_E]]
-; CHECK: fmov w0, [[RES]]
-  %res = call i32 @llvm.arm64.crypto.sha1h(i32 %hash_e)
-  ret i32 %res
-}
-
-define <4 x i32> @test_sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11) {
-; CHECK-LABEL: test_sha1su0:
-; CHECK: sha1su0.4s v0, v1, v2
-  %res = call <4 x i32> @llvm.arm64.crypto.sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11)
-  ret <4 x i32> %res
-}
-
-define <4 x i32> @test_sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15) {
-; CHECK-LABEL: test_sha1su1:
-; CHECK: sha1su1.4s v0, v1
-  %res = call <4 x i32> @llvm.arm64.crypto.sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15)
-  ret <4 x i32> %res
-}
-
-declare <4 x i32> @llvm.arm64.crypto.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk)
-declare <4 x i32> @llvm.arm64.crypto.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk)
-declare <4 x i32> @llvm.arm64.crypto.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7)
-declare <4 x i32> @llvm.arm64.crypto.sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15)
-
-define <4 x i32> @test_sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) {
-; CHECK-LABEL: test_sha256h:
-; CHECK: sha256h.4s q0, q1, v2
-  %res = call <4 x i32> @llvm.arm64.crypto.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk)
-  ret <4 x i32> %res
-}
-
-define <4 x i32> @test_sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) {
-; CHECK-LABEL: test_sha256h2:
-; CHECK: sha256h2.4s q0, q1, v2
-
-  %res = call <4 x i32> @llvm.arm64.crypto.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk)
-  ret <4 x i32> %res
-}
-
-define <4 x i32> @test_sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7) {
-; CHECK-LABEL: test_sha256su0:
-; CHECK: sha256su0.4s v0, v1
-  %res = call <4 x i32> @llvm.arm64.crypto.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7)
-  ret <4 x i32> %res
-}
-
-define <4 x i32> @test_sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) {
-; CHECK-LABEL: test_sha256su1:
-; CHECK: sha256su1.4s v0, v1, v2
-  %res = call <4 x i32> @llvm.arm64.crypto.sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15)
-  ret <4 x i32> %res
-}
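
The crypto tests migrate the same way; a minimal sketch of the AES round in renamed form, assuming the llvm.aarch64.crypto.aese spelling and that -aarch64-neon-syntax=apple replaces the old -arm64-neon-syntax=apple RUN-line flag:

; expected lowering under Apple syntax: aese.16b v0, v1
define <16 x i8> @aese_renamed(<16 x i8> %data, <16 x i8> %key) {
  %res = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %data, <16 x i8> %key)
  ret <16 x i8> %res
}

declare <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8>, <16 x i8>)
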
diff --git a/llvm/test/CodeGen/ARM64/cvt.ll b/llvm/test/CodeGen/ARM64/cvt.ll
deleted file mode 100644
index b55a42f..0000000
--- a/llvm/test/CodeGen/ARM64/cvt.ll
+++ /dev/null
@@ -1,401 +0,0 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
-
-;
-; Floating-point scalar convert to signed integer (to nearest with ties to away)
-;
-define i32 @fcvtas_1w1s(float %A) nounwind {
-;CHECK-LABEL: fcvtas_1w1s:
-;CHECK: fcvtas w0, s0
-;CHECK-NEXT: ret
-	%tmp3 = call i32 @llvm.arm64.neon.fcvtas.i32.f32(float %A)
-	ret i32 %tmp3
-}
-
-define i64 @fcvtas_1x1s(float %A) nounwind {
-;CHECK-LABEL: fcvtas_1x1s:
-;CHECK: fcvtas x0, s0
-;CHECK-NEXT: ret
-	%tmp3 = call i64 @llvm.arm64.neon.fcvtas.i64.f32(float %A)
-	ret i64 %tmp3
-}
-
-define i32 @fcvtas_1w1d(double %A) nounwind {
-;CHECK-LABEL: fcvtas_1w1d:
-;CHECK: fcvtas w0, d0
-;CHECK-NEXT: ret
-	%tmp3 = call i32 @llvm.arm64.neon.fcvtas.i32.f64(double %A)
-	ret i32 %tmp3
-}
-
-define i64 @fcvtas_1x1d(double %A) nounwind {
-;CHECK-LABEL: fcvtas_1x1d:
-;CHECK: fcvtas x0, d0
-;CHECK-NEXT: ret
-	%tmp3 = call i64 @llvm.arm64.neon.fcvtas.i64.f64(double %A)
-	ret i64 %tmp3
-}
-
-declare i32 @llvm.arm64.neon.fcvtas.i32.f32(float) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtas.i64.f32(float) nounwind readnone
-declare i32 @llvm.arm64.neon.fcvtas.i32.f64(double) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtas.i64.f64(double) nounwind readnone
-
-;
-; Floating-point scalar convert to unsigned integer
-;
-define i32 @fcvtau_1w1s(float %A) nounwind {
-;CHECK-LABEL: fcvtau_1w1s:
-;CHECK: fcvtau w0, s0
-;CHECK-NEXT: ret
-	%tmp3 = call i32 @llvm.arm64.neon.fcvtau.i32.f32(float %A)
-	ret i32 %tmp3
-}
-
-define i64 @fcvtau_1x1s(float %A) nounwind {
-;CHECK-LABEL: fcvtau_1x1s:
-;CHECK: fcvtau x0, s0
-;CHECK-NEXT: ret
-	%tmp3 = call i64 @llvm.arm64.neon.fcvtau.i64.f32(float %A)
-	ret i64 %tmp3
-}
-
-define i32 @fcvtau_1w1d(double %A) nounwind {
-;CHECK-LABEL: fcvtau_1w1d:
-;CHECK: fcvtau w0, d0
-;CHECK-NEXT: ret
-	%tmp3 = call i32 @llvm.arm64.neon.fcvtau.i32.f64(double %A)
-	ret i32 %tmp3
-}
-
-define i64 @fcvtau_1x1d(double %A) nounwind {
-;CHECK-LABEL: fcvtau_1x1d:
-;CHECK: fcvtau x0, d0
-;CHECK-NEXT: ret
-	%tmp3 = call i64 @llvm.arm64.neon.fcvtau.i64.f64(double %A)
-	ret i64 %tmp3
-}
-
-declare i32 @llvm.arm64.neon.fcvtau.i32.f32(float) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtau.i64.f32(float) nounwind readnone
-declare i32 @llvm.arm64.neon.fcvtau.i32.f64(double) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtau.i64.f64(double) nounwind readnone
-
-;
-; Floating-point scalar convert to signed integer (toward -Inf)
-;
-define i32 @fcvtms_1w1s(float %A) nounwind {
-;CHECK-LABEL: fcvtms_1w1s:
-;CHECK: fcvtms w0, s0
-;CHECK-NEXT: ret
-	%tmp3 = call i32 @llvm.arm64.neon.fcvtms.i32.f32(float %A)
-	ret i32 %tmp3
-}
-
-define i64 @fcvtms_1x1s(float %A) nounwind {
-;CHECK-LABEL: fcvtms_1x1s:
-;CHECK: fcvtms x0, s0
-;CHECK-NEXT: ret
-	%tmp3 = call i64 @llvm.arm64.neon.fcvtms.i64.f32(float %A)
-	ret i64 %tmp3
-}
-
-define i32 @fcvtms_1w1d(double %A) nounwind {
-;CHECK-LABEL: fcvtms_1w1d:
-;CHECK: fcvtms w0, d0
-;CHECK-NEXT: ret
-	%tmp3 = call i32 @llvm.arm64.neon.fcvtms.i32.f64(double %A)
-	ret i32 %tmp3
-}
-
-define i64 @fcvtms_1x1d(double %A) nounwind {
-;CHECK-LABEL: fcvtms_1x1d:
-;CHECK: fcvtms x0, d0
-;CHECK-NEXT: ret
-	%tmp3 = call i64 @llvm.arm64.neon.fcvtms.i64.f64(double %A)
-	ret i64 %tmp3
-}
-
-declare i32 @llvm.arm64.neon.fcvtms.i32.f32(float) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtms.i64.f32(float) nounwind readnone
-declare i32 @llvm.arm64.neon.fcvtms.i32.f64(double) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtms.i64.f64(double) nounwind readnone
-
-;
-; Floating-point scalar convert to unsigned integer (toward -Inf)
-;
-define i32 @fcvtmu_1w1s(float %A) nounwind {
-;CHECK-LABEL: fcvtmu_1w1s:
-;CHECK: fcvtmu w0, s0
-;CHECK-NEXT: ret
-	%tmp3 = call i32 @llvm.arm64.neon.fcvtmu.i32.f32(float %A)
-	ret i32 %tmp3
-}
-
-define i64 @fcvtmu_1x1s(float %A) nounwind {
-;CHECK-LABEL: fcvtmu_1x1s:
-;CHECK: fcvtmu x0, s0
-;CHECK-NEXT: ret
-	%tmp3 = call i64 @llvm.arm64.neon.fcvtmu.i64.f32(float %A)
-	ret i64 %tmp3
-}
-
-define i32 @fcvtmu_1w1d(double %A) nounwind {
-;CHECK-LABEL: fcvtmu_1w1d:
-;CHECK: fcvtmu w0, d0
-;CHECK-NEXT: ret
-	%tmp3 = call i32 @llvm.arm64.neon.fcvtmu.i32.f64(double %A)
-	ret i32 %tmp3
-}
-
-define i64 @fcvtmu_1x1d(double %A) nounwind {
-;CHECK-LABEL: fcvtmu_1x1d:
-;CHECK: fcvtmu x0, d0
-;CHECK-NEXT: ret
-	%tmp3 = call i64 @llvm.arm64.neon.fcvtmu.i64.f64(double %A)
-	ret i64 %tmp3
-}
-
-declare i32 @llvm.arm64.neon.fcvtmu.i32.f32(float) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtmu.i64.f32(float) nounwind readnone
-declare i32 @llvm.arm64.neon.fcvtmu.i32.f64(double) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtmu.i64.f64(double) nounwind readnone
-
-;
-; Floating-point scalar convert to signed integer (to nearest with ties to even)
-;
-define i32 @fcvtns_1w1s(float %A) nounwind {
-;CHECK-LABEL: fcvtns_1w1s:
-;CHECK: fcvtns w0, s0
-;CHECK-NEXT: ret
-	%tmp3 = call i32 @llvm.arm64.neon.fcvtns.i32.f32(float %A)
-	ret i32 %tmp3
-}
-
-define i64 @fcvtns_1x1s(float %A) nounwind {
-;CHECK-LABEL: fcvtns_1x1s:
-;CHECK: fcvtns x0, s0
-;CHECK-NEXT: ret
-	%tmp3 = call i64 @llvm.arm64.neon.fcvtns.i64.f32(float %A)
-	ret i64 %tmp3
-}
-
-define i32 @fcvtns_1w1d(double %A) nounwind {
-;CHECK-LABEL: fcvtns_1w1d:
-;CHECK: fcvtns w0, d0
-;CHECK-NEXT: ret
-	%tmp3 = call i32 @llvm.arm64.neon.fcvtns.i32.f64(double %A)
-	ret i32 %tmp3
-}
-
-define i64 @fcvtns_1x1d(double %A) nounwind {
-;CHECK-LABEL: fcvtns_1x1d:
-;CHECK: fcvtns x0, d0
-;CHECK-NEXT: ret
-	%tmp3 = call i64 @llvm.arm64.neon.fcvtns.i64.f64(double %A)
-	ret i64 %tmp3
-}
-
-declare i32 @llvm.arm64.neon.fcvtns.i32.f32(float) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtns.i64.f32(float) nounwind readnone
-declare i32 @llvm.arm64.neon.fcvtns.i32.f64(double) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtns.i64.f64(double) nounwind readnone
-
-;
-; Floating-point scalar convert to unsigned integer (to nearest with ties to even)
-;
-define i32 @fcvtnu_1w1s(float %A) nounwind {
-;CHECK-LABEL: fcvtnu_1w1s:
-;CHECK: fcvtnu w0, s0
-;CHECK-NEXT: ret
-	%tmp3 = call i32 @llvm.arm64.neon.fcvtnu.i32.f32(float %A)
-	ret i32 %tmp3
-}
-
-define i64 @fcvtnu_1x1s(float %A) nounwind {
-;CHECK-LABEL: fcvtnu_1x1s:
-;CHECK: fcvtnu x0, s0
-;CHECK-NEXT: ret
-	%tmp3 = call i64 @llvm.arm64.neon.fcvtnu.i64.f32(float %A)
-	ret i64 %tmp3
-}
-
-define i32 @fcvtnu_1w1d(double %A) nounwind {
-;CHECK-LABEL: fcvtnu_1w1d:
-;CHECK: fcvtnu w0, d0
-;CHECK-NEXT: ret
-	%tmp3 = call i32 @llvm.arm64.neon.fcvtnu.i32.f64(double %A)
-	ret i32 %tmp3
-}
-
-define i64 @fcvtnu_1x1d(double %A) nounwind {
-;CHECK-LABEL: fcvtnu_1x1d:
-;CHECK: fcvtnu x0, d0
-;CHECK-NEXT: ret
-	%tmp3 = call i64 @llvm.arm64.neon.fcvtnu.i64.f64(double %A)
-	ret i64 %tmp3
-}
-
-declare i32 @llvm.arm64.neon.fcvtnu.i32.f32(float) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtnu.i64.f32(float) nounwind readnone
-declare i32 @llvm.arm64.neon.fcvtnu.i32.f64(double) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtnu.i64.f64(double) nounwind readnone
-
-;
-; Floating-point scalar convert to signed integer (toward +Inf)
-;
-define i32 @fcvtps_1w1s(float %A) nounwind {
-;CHECK-LABEL: fcvtps_1w1s:
-;CHECK: fcvtps w0, s0
-;CHECK-NEXT: ret
-	%tmp3 = call i32 @llvm.arm64.neon.fcvtps.i32.f32(float %A)
-	ret i32 %tmp3
-}
-
-define i64 @fcvtps_1x1s(float %A) nounwind {
-;CHECK-LABEL: fcvtps_1x1s:
-;CHECK: fcvtps x0, s0
-;CHECK-NEXT: ret
-	%tmp3 = call i64 @llvm.arm64.neon.fcvtps.i64.f32(float %A)
-	ret i64 %tmp3
-}
-
-define i32 @fcvtps_1w1d(double %A) nounwind {
-;CHECK-LABEL: fcvtps_1w1d:
-;CHECK: fcvtps w0, d0
-;CHECK-NEXT: ret
-	%tmp3 = call i32 @llvm.arm64.neon.fcvtps.i32.f64(double %A)
-	ret i32 %tmp3
-}
-
-define i64 @fcvtps_1x1d(double %A) nounwind {
-;CHECK-LABEL: fcvtps_1x1d:
-;CHECK: fcvtps x0, d0
-;CHECK-NEXT: ret
-	%tmp3 = call i64 @llvm.arm64.neon.fcvtps.i64.f64(double %A)
-	ret i64 %tmp3
-}
-
-declare i32 @llvm.arm64.neon.fcvtps.i32.f32(float) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtps.i64.f32(float) nounwind readnone
-declare i32 @llvm.arm64.neon.fcvtps.i32.f64(double) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtps.i64.f64(double) nounwind readnone
-
-;
-; Floating-point scalar convert to unsigned integer (toward +Inf)
-;
-define i32 @fcvtpu_1w1s(float %A) nounwind {
-;CHECK-LABEL: fcvtpu_1w1s:
-;CHECK: fcvtpu w0, s0
-;CHECK-NEXT: ret
-	%tmp3 = call i32 @llvm.arm64.neon.fcvtpu.i32.f32(float %A)
-	ret i32 %tmp3
-}
-
-define i64 @fcvtpu_1x1s(float %A) nounwind {
-;CHECK-LABEL: fcvtpu_1x1s:
-;CHECK: fcvtpu x0, s0
-;CHECK-NEXT: ret
-	%tmp3 = call i64 @llvm.arm64.neon.fcvtpu.i64.f32(float %A)
-	ret i64 %tmp3
-}
-
-define i32 @fcvtpu_1w1d(double %A) nounwind {
-;CHECK-LABEL: fcvtpu_1w1d:
-;CHECK: fcvtpu w0, d0
-;CHECK-NEXT: ret
-	%tmp3 = call i32 @llvm.arm64.neon.fcvtpu.i32.f64(double %A)
-	ret i32 %tmp3
-}
-
-define i64 @fcvtpu_1x1d(double %A) nounwind {
-;CHECK-LABEL: fcvtpu_1x1d:
-;CHECK: fcvtpu x0, d0
-;CHECK-NEXT: ret
-	%tmp3 = call i64 @llvm.arm64.neon.fcvtpu.i64.f64(double %A)
-	ret i64 %tmp3
-}
-
-declare i32 @llvm.arm64.neon.fcvtpu.i32.f32(float) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtpu.i64.f32(float) nounwind readnone
-declare i32 @llvm.arm64.neon.fcvtpu.i32.f64(double) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtpu.i64.f64(double) nounwind readnone
-
-;
-;  Floating-point scalar convert to signed integer (toward zero)
-;
-define i32 @fcvtzs_1w1s(float %A) nounwind {
-;CHECK-LABEL: fcvtzs_1w1s:
-;CHECK: fcvtzs w0, s0
-;CHECK-NEXT: ret
-	%tmp3 = call i32 @llvm.arm64.neon.fcvtzs.i32.f32(float %A)
-	ret i32 %tmp3
-}
-
-define i64 @fcvtzs_1x1s(float %A) nounwind {
-;CHECK-LABEL: fcvtzs_1x1s:
-;CHECK: fcvtzs x0, s0
-;CHECK-NEXT: ret
-	%tmp3 = call i64 @llvm.arm64.neon.fcvtzs.i64.f32(float %A)
-	ret i64 %tmp3
-}
-
-define i32 @fcvtzs_1w1d(double %A) nounwind {
-;CHECK-LABEL: fcvtzs_1w1d:
-;CHECK: fcvtzs w0, d0
-;CHECK-NEXT: ret
-	%tmp3 = call i32 @llvm.arm64.neon.fcvtzs.i32.f64(double %A)
-	ret i32 %tmp3
-}
-
-define i64 @fcvtzs_1x1d(double %A) nounwind {
-;CHECK-LABEL: fcvtzs_1x1d:
-;CHECK: fcvtzs x0, d0
-;CHECK-NEXT: ret
-	%tmp3 = call i64 @llvm.arm64.neon.fcvtzs.i64.f64(double %A)
-	ret i64 %tmp3
-}
-
-declare i32 @llvm.arm64.neon.fcvtzs.i32.f32(float) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtzs.i64.f32(float) nounwind readnone
-declare i32 @llvm.arm64.neon.fcvtzs.i32.f64(double) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtzs.i64.f64(double) nounwind readnone
-
-;
-; Floating-point scalar convert to unsigned integer (toward zero)
-;
-define i32 @fcvtzu_1w1s(float %A) nounwind {
-;CHECK-LABEL: fcvtzu_1w1s:
-;CHECK: fcvtzu w0, s0
-;CHECK-NEXT: ret
-	%tmp3 = call i32 @llvm.arm64.neon.fcvtzu.i32.f32(float %A)
-	ret i32 %tmp3
-}
-
-define i64 @fcvtzu_1x1s(float %A) nounwind {
-;CHECK-LABEL: fcvtzu_1x1s:
-;CHECK: fcvtzu x0, s0
-;CHECK-NEXT: ret
-	%tmp3 = call i64 @llvm.arm64.neon.fcvtzu.i64.f32(float %A)
-	ret i64 %tmp3
-}
-
-define i32 @fcvtzu_1w1d(double %A) nounwind {
-;CHECK-LABEL: fcvtzu_1w1d:
-;CHECK: fcvtzu w0, d0
-;CHECK-NEXT: ret
-	%tmp3 = call i32 @llvm.arm64.neon.fcvtzu.i32.f64(double %A)
-	ret i32 %tmp3
-}
-
-define i64 @fcvtzu_1x1d(double %A) nounwind {
-;CHECK-LABEL: fcvtzu_1x1d:
-;CHECK: fcvtzu x0, d0
-;CHECK-NEXT: ret
-	%tmp3 = call i64 @llvm.arm64.neon.fcvtzu.i64.f64(double %A)
-	ret i64 %tmp3
-}
-
-declare i32 @llvm.arm64.neon.fcvtzu.i32.f32(float) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtzu.i64.f32(float) nounwind readnone
-declare i32 @llvm.arm64.neon.fcvtzu.i32.f64(double) nounwind readnone
-declare i64 @llvm.arm64.neon.fcvtzu.i64.f64(double) nounwind readnone
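
Each of the eight rounding-mode conversion families above renames mechanically; a minimal sketch for fcvtas, assuming the llvm.aarch64.neon.fcvtas.i32.f32 spelling (the function name is illustrative):

; expected lowering: fcvtas w0, s0
define i32 @fcvtas_renamed(float %A) nounwind {
  %cvt = call i32 @llvm.aarch64.neon.fcvtas.i32.f32(float %A)
  ret i32 %cvt
}

declare i32 @llvm.aarch64.neon.fcvtas.i32.f32(float) nounwind readnone
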
diff --git a/llvm/test/CodeGen/ARM64/fminv.ll b/llvm/test/CodeGen/ARM64/fminv.ll
deleted file mode 100644
index ca706d8..0000000
--- a/llvm/test/CodeGen/ARM64/fminv.ll
+++ /dev/null
@@ -1,101 +0,0 @@
-; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s
-
-define float @test_fminv_v2f32(<2 x float> %in) {
-; CHECK: test_fminv_v2f32:
-; CHECK: fminp s0, v0.2s
-  %min = call float @llvm.arm64.neon.fminv.f32.v2f32(<2 x float> %in)
-  ret float %min
-}
-
-define float @test_fminv_v4f32(<4 x float> %in) {
-; CHECK: test_fminv_v4f32:
-; CHECK: fminv s0, v0.4s
-  %min = call float @llvm.arm64.neon.fminv.f32.v4f32(<4 x float> %in)
-  ret float %min
-}
-
-define double @test_fminv_v2f64(<2 x double> %in) {
-; CHECK: test_fminv_v2f64:
-; CHECK: fminp d0, v0.2d
-  %min = call double @llvm.arm64.neon.fminv.f64.v2f64(<2 x double> %in)
-  ret double %min
-}
-
-declare float @llvm.arm64.neon.fminv.f32.v2f32(<2 x float>)
-declare float @llvm.arm64.neon.fminv.f32.v4f32(<4 x float>)
-declare double @llvm.arm64.neon.fminv.f64.v2f64(<2 x double>)
-
-define float @test_fmaxv_v2f32(<2 x float> %in) {
-; CHECK: test_fmaxv_v2f32:
-; CHECK: fmaxp s0, v0.2s
-  %max = call float @llvm.arm64.neon.fmaxv.f32.v2f32(<2 x float> %in)
-  ret float %max
-}
-
-define float @test_fmaxv_v4f32(<4 x float> %in) {
-; CHECK: test_fmaxv_v4f32:
-; CHECK: fmaxv s0, v0.4s
-  %max = call float @llvm.arm64.neon.fmaxv.f32.v4f32(<4 x float> %in)
-  ret float %max
-}
-
-define double @test_fmaxv_v2f64(<2 x double> %in) {
-; CHECK: test_fmaxv_v2f64:
-; CHECK: fmaxp d0, v0.2d
-  %max = call double @llvm.arm64.neon.fmaxv.f64.v2f64(<2 x double> %in)
-  ret double %max
-}
-
-declare float @llvm.arm64.neon.fmaxv.f32.v2f32(<2 x float>)
-declare float @llvm.arm64.neon.fmaxv.f32.v4f32(<4 x float>)
-declare double @llvm.arm64.neon.fmaxv.f64.v2f64(<2 x double>)
-
-define float @test_fminnmv_v2f32(<2 x float> %in) {
-; CHECK: test_fminnmv_v2f32:
-; CHECK: fminnmp s0, v0.2s
-  %minnm = call float @llvm.arm64.neon.fminnmv.f32.v2f32(<2 x float> %in)
-  ret float %minnm
-}
-
-define float @test_fminnmv_v4f32(<4 x float> %in) {
-; CHECK: test_fminnmv_v4f32:
-; CHECK: fminnmv s0, v0.4s
-  %minnm = call float @llvm.arm64.neon.fminnmv.f32.v4f32(<4 x float> %in)
-  ret float %minnm
-}
-
-define double @test_fminnmv_v2f64(<2 x double> %in) {
-; CHECK: test_fminnmv_v2f64:
-; CHECK: fminnmp d0, v0.2d
-  %minnm = call double @llvm.arm64.neon.fminnmv.f64.v2f64(<2 x double> %in)
-  ret double %minnm
-}
-
-declare float @llvm.arm64.neon.fminnmv.f32.v2f32(<2 x float>)
-declare float @llvm.arm64.neon.fminnmv.f32.v4f32(<4 x float>)
-declare double @llvm.arm64.neon.fminnmv.f64.v2f64(<2 x double>)
-
-define float @test_fmaxnmv_v2f32(<2 x float> %in) {
-; CHECK: test_fmaxnmv_v2f32:
-; CHECK: fmaxnmp s0, v0.2s
-  %maxnm = call float @llvm.arm64.neon.fmaxnmv.f32.v2f32(<2 x float> %in)
-  ret float %maxnm
-}
-
-define float @test_fmaxnmv_v4f32(<4 x float> %in) {
-; CHECK: test_fmaxnmv_v4f32:
-; CHECK: fmaxnmv s0, v0.4s
-  %maxnm = call float @llvm.arm64.neon.fmaxnmv.f32.v4f32(<4 x float> %in)
-  ret float %maxnm
-}
-
-define double @test_fmaxnmv_v2f64(<2 x double> %in) {
-; CHECK: test_fmaxnmv_v2f64:
-; CHECK: fmaxnmp d0, v0.2d
-  %maxnm = call double @llvm.arm64.neon.fmaxnmv.f64.v2f64(<2 x double> %in)
-  ret double %maxnm
-}
-
-declare float @llvm.arm64.neon.fmaxnmv.f32.v2f32(<2 x float>)
-declare float @llvm.arm64.neon.fmaxnmv.f32.v4f32(<4 x float>)
-declare double @llvm.arm64.neon.fmaxnmv.f64.v2f64(<2 x double>)
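
The across-vector min/max reductions rename the same way; a minimal sketch for the two-element case, assuming the llvm.aarch64.neon.fminv.f32.v2f32 spelling. Note the pairwise fminp lowering for v2f32, as the deleted test expects, versus fminv for v4f32:

define float @fminv_renamed(<2 x float> %in) {
  ; two-element reduction lowers to fminp s0, v0.2s
  %min = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> %in)
  ret float %min
}

declare float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float>)
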
diff --git a/llvm/test/CodeGen/ARM64/simd-scalar-to-vector.ll b/llvm/test/CodeGen/ARM64/simd-scalar-to-vector.ll
deleted file mode 100644
index 10e8d6c..0000000
--- a/llvm/test/CodeGen/ARM64/simd-scalar-to-vector.ll
+++ /dev/null
@@ -1,22 +0,0 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -mcpu=cyclone | FileCheck %s
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -O0 -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST
-
-define <16 x i8> @foo(<16 x i8> %a) nounwind optsize readnone ssp {
-; CHECK: uaddlv.16b h0, v0
-; CHECK: rshrn.8b v0, v0, #4
-; CHECK: dup.16b v0, v0[0]
-; CHECK: ret
-
-; CHECK-FAST: uaddlv.16b
-; CHECK-FAST: rshrn.8b
-; CHECK-FAST: dup.16b
-  %tmp = tail call i32 @llvm.arm64.neon.uaddlv.i32.v16i8(<16 x i8> %a) nounwind
-  %tmp1 = trunc i32 %tmp to i16
-  %tmp2 = insertelement <8 x i16> undef, i16 %tmp1, i32 0
-  %tmp3 = tail call <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16> %tmp2, i32 4)
-  %tmp4 = shufflevector <8 x i8> %tmp3, <8 x i8> undef, <16 x i32> zeroinitializer
-  ret <16 x i8> %tmp4
-}
-
-declare <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16>, i32) nounwind readnone
-declare i32 @llvm.arm64.neon.uaddlv.i32.v16i8(<16 x i8>) nounwind readnone
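
The uaddlv/rshrn combination above also survives the prefix swap; a minimal sketch, assuming the llvm.aarch64.neon.uaddlv.i32.v16i8 and llvm.aarch64.neon.rshrn.v8i8 spellings:

define <8 x i8> @uaddlv_rshrn_renamed(<16 x i8> %a) nounwind {
  ; uaddlv.16b reduces to a scalar; rshrn narrows it back to bytes
  %sum = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> %a)
  %tr = trunc i32 %sum to i16
  %vec = insertelement <8 x i16> undef, i16 %tr, i32 0
  %narrow = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %vec, i32 4)
  ret <8 x i8> %narrow
}

declare i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8>) nounwind readnone
declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32) nounwind readnone
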
diff --git a/llvm/test/CodeGen/ARM64/st1.ll b/llvm/test/CodeGen/ARM64/st1.ll
deleted file mode 100644
index b9aafc6..0000000
--- a/llvm/test/CodeGen/ARM64/st1.ll
+++ /dev/null
@@ -1,676 +0,0 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
-
-define void @st1lane_16b(<16 x i8> %A, i8* %D) {
-; CHECK-LABEL: st1lane_16b
-; CHECK: st1.b
-  %tmp = extractelement <16 x i8> %A, i32 1
-  store i8 %tmp, i8* %D
-  ret void
-}
-
-define void @st1lane_8h(<8 x i16> %A, i16* %D) {
-; CHECK-LABEL: st1lane_8h
-; CHECK: st1.h
-  %tmp = extractelement <8 x i16> %A, i32 1
-  store i16 %tmp, i16* %D
-  ret void
-}
-
-define void @st1lane_4s(<4 x i32> %A, i32* %D) {
-; CHECK-LABEL: st1lane_4s
-; CHECK: st1.s
-  %tmp = extractelement <4 x i32> %A, i32 1
-  store i32 %tmp, i32* %D
-  ret void
-}
-
-define void @st1lane_4s_float(<4 x float> %A, float* %D) {
-; CHECK-LABEL: st1lane_4s_float
-; CHECK: st1.s
-  %tmp = extractelement <4 x float> %A, i32 1
-  store float %tmp, float* %D
-  ret void
-}
-
-define void @st1lane_2d(<2 x i64> %A, i64* %D) {
-; CHECK-LABEL: st1lane_2d
-; CHECK: st1.d
-  %tmp = extractelement <2 x i64> %A, i32 1
-  store i64 %tmp, i64* %D
-  ret void
-}
-
-define void @st1lane_2d_double(<2 x double> %A, double* %D) {
-; CHECK-LABEL: st1lane_2d_double
-; CHECK: st1.d
-  %tmp = extractelement <2 x double> %A, i32 1
-  store double %tmp, double* %D
-  ret void
-}
-
-define void @st1lane_8b(<8 x i8> %A, i8* %D) {
-; CHECK-LABEL: st1lane_8b
-; CHECK: st1.b
-  %tmp = extractelement <8 x i8> %A, i32 1
-  store i8 %tmp, i8* %D
-  ret void
-}
-
-define void @st1lane_4h(<4 x i16> %A, i16* %D) {
-; CHECK-LABEL: st1lane_4h
-; CHECK: st1.h
-  %tmp = extractelement <4 x i16> %A, i32 1
-  store i16 %tmp, i16* %D
-  ret void
-}
-
-define void @st1lane_2s(<2 x i32> %A, i32* %D) {
-; CHECK-LABEL: st1lane_2s
-; CHECK: st1.s
-  %tmp = extractelement <2 x i32> %A, i32 1
-  store i32 %tmp, i32* %D
-  ret void
-}
-
-define void @st1lane_2s_float(<2 x float> %A, float* %D) {
-; CHECK-LABEL: st1lane_2s_float
-; CHECK: st1.s
-  %tmp = extractelement <2 x float> %A, i32 1
-  store float %tmp, float* %D
-  ret void
-}
-
-define void @st2lane_16b(<16 x i8> %A, <16 x i8> %B, i8* %D) {
-; CHECK-LABEL: st2lane_16b
-; CHECK: st2.b
-  call void @llvm.arm64.neon.st2lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i64 1, i8* %D)
-  ret void
-}
-
-define void @st2lane_8h(<8 x i16> %A, <8 x i16> %B, i16* %D) {
-; CHECK-LABEL: st2lane_8h
-; CHECK: st2.h
-  call void @llvm.arm64.neon.st2lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i64 1, i16* %D)
-  ret void
-}
-
-define void @st2lane_4s(<4 x i32> %A, <4 x i32> %B, i32* %D) {
-; CHECK-LABEL: st2lane_4s
-; CHECK: st2.s
-  call void @llvm.arm64.neon.st2lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i64 1, i32* %D)
-  ret void
-}
-
-define void @st2lane_2d(<2 x i64> %A, <2 x i64> %B, i64* %D) {
-; CHECK-LABEL: st2lane_2d
-; CHECK: st2.d
-  call void @llvm.arm64.neon.st2lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64 1, i64* %D)
-  ret void
-}
-
-declare void @llvm.arm64.neon.st2lane.v16i8.p0i8(<16 x i8>, <16 x i8>, i64, i8*) nounwind readnone
-declare void @llvm.arm64.neon.st2lane.v8i16.p0i16(<8 x i16>, <8 x i16>, i64, i16*) nounwind readnone
-declare void @llvm.arm64.neon.st2lane.v4i32.p0i32(<4 x i32>, <4 x i32>, i64, i32*) nounwind readnone
-declare void @llvm.arm64.neon.st2lane.v2i64.p0i64(<2 x i64>, <2 x i64>, i64, i64*) nounwind readnone
-
-define void @st3lane_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %D) {
-; CHECK-LABEL: st3lane_16b
-; CHECK: st3.b
-  call void @llvm.arm64.neon.st3lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i64 1, i8* %D)
-  ret void
-}
-
-define void @st3lane_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %D) {
-; CHECK-LABEL: st3lane_8h
-; CHECK: st3.h
-  call void @llvm.arm64.neon.st3lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i64 1, i16* %D)
-  ret void
-}
-
-define void @st3lane_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %D) {
-; CHECK-LABEL: st3lane_4s
-; CHECK: st3.s
-  call void @llvm.arm64.neon.st3lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i64 1, i32* %D)
-  ret void
-}
-
-define void @st3lane_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %D) {
-; CHECK-LABEL: st3lane_2d
-; CHECK: st3.d
-  call void @llvm.arm64.neon.st3lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64 1, i64* %D)
-  ret void
-}
-
-declare void @llvm.arm64.neon.st3lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readnone
-declare void @llvm.arm64.neon.st3lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readnone
-declare void @llvm.arm64.neon.st3lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readnone
-declare void @llvm.arm64.neon.st3lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readnone
-
-define void @st4lane_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %E) {
-; CHECK-LABEL: st4lane_16b
-; CHECK: st4.b
-  call void @llvm.arm64.neon.st4lane.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i64 1, i8* %E)
-  ret void
-}
-
-define void @st4lane_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %E) {
-; CHECK-LABEL: st4lane_8h
-; CHECK: st4.h
-  call void @llvm.arm64.neon.st4lane.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i64 1, i16* %E)
-  ret void
-}
-
-define void @st4lane_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %E) {
-; CHECK-LABEL: st4lane_4s
-; CHECK: st4.s
-  call void @llvm.arm64.neon.st4lane.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i64 1, i32* %E)
-  ret void
-}
-
-define void @st4lane_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %E) {
-; CHECK-LABEL: st4lane_2d
-; CHECK: st4.d
-  call void @llvm.arm64.neon.st4lane.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64 1, i64* %E)
-  ret void
-}
-
-declare void @llvm.arm64.neon.st4lane.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i64, i8*) nounwind readnone
-declare void @llvm.arm64.neon.st4lane.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i64, i16*) nounwind readnone
-declare void @llvm.arm64.neon.st4lane.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i64, i32*) nounwind readnone
-declare void @llvm.arm64.neon.st4lane.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64, i64*) nounwind readnone
-
-
-define void @st2_8b(<8 x i8> %A, <8 x i8> %B, i8* %P) nounwind {
-; CHECK-LABEL: st2_8b
-; CHECK: st2.8b
-	call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, i8* %P)
-	ret void
-}
-
-define void @st3_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %P) nounwind {
-; CHECK-LABEL: st3_8b
-; CHECK: st3.8b
-	call void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %P)
-	ret void
-}
-
-define void @st4_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %P) nounwind {
-; CHECK-LABEL: st4_8b
-; CHECK: st4.8b
-	call void @llvm.arm64.neon.st4.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %P)
-	ret void
-}
-
-declare void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) nounwind readonly
-declare void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly
-declare void @llvm.arm64.neon.st4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly
-
-define void @st2_16b(<16 x i8> %A, <16 x i8> %B, i8* %P) nounwind {
-; CHECK-LABEL: st2_16b
-; CHECK: st2.16b
-	call void @llvm.arm64.neon.st2.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i8* %P)
-	ret void
-}
-
-define void @st3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %P) nounwind {
-; CHECK-LABEL: st3_16b
-; CHECK: st3.16b
-	call void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %P)
-	ret void
-}
-
-define void @st4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %P) nounwind {
-; CHECK-LABEL: st4_16b
-; CHECK: st4.16b
-	call void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %P)
-	ret void
-}
-
-declare void @llvm.arm64.neon.st2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*) nounwind readonly
-declare void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly
-declare void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly
-
-define void @st2_4h(<4 x i16> %A, <4 x i16> %B, i16* %P) nounwind {
-; CHECK-LABEL: st2_4h
-; CHECK: st2.4h
-	call void @llvm.arm64.neon.st2.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, i16* %P)
-	ret void
-}
-
-define void @st3_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %P) nounwind {
-; CHECK-LABEL: st3_4h
-; CHECK: st3.4h
-	call void @llvm.arm64.neon.st3.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %P)
-	ret void
-}
-
-define void @st4_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %P) nounwind {
-; CHECK-LABEL: st4_4h
-; CHECK: st4.4h
-	call void @llvm.arm64.neon.st4.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %P)
-	ret void
-}
-
-declare void @llvm.arm64.neon.st2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*) nounwind readonly
-declare void @llvm.arm64.neon.st3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly
-declare void @llvm.arm64.neon.st4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly
-
-define void @st2_8h(<8 x i16> %A, <8 x i16> %B, i16* %P) nounwind {
-; CHECK-LABEL: st2_8h
-; CHECK: st2.8h
-	call void @llvm.arm64.neon.st2.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i16* %P)
-	ret void
-}
-
-define void @st3_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %P) nounwind {
-; CHECK-LABEL: st3_8h
-; CHECK: st3.8h
-	call void @llvm.arm64.neon.st3.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %P)
-	ret void
-}
-
-define void @st4_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %P) nounwind {
-; CHECK-LABEL: st4_8h
-; CHECK: st4.8h
-	call void @llvm.arm64.neon.st4.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %P)
-	ret void
-}
-
-declare void @llvm.arm64.neon.st2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*) nounwind readonly
-declare void @llvm.arm64.neon.st3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly
-declare void @llvm.arm64.neon.st4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly
-
-define void @st2_2s(<2 x i32> %A, <2 x i32> %B, i32* %P) nounwind {
-; CHECK-LABEL: st2_2s
-; CHECK: st2.2s
-	call void @llvm.arm64.neon.st2.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, i32* %P)
-	ret void
-}
-
-define void @st3_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %P) nounwind {
-; CHECK-LABEL: st3_2s
-; CHECK: st3.2s
-	call void @llvm.arm64.neon.st3.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %P)
-	ret void
-}
-
-define void @st4_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %P) nounwind {
-; CHECK-LABEL: st4_2s
-; CHECK: st4.2s
-	call void @llvm.arm64.neon.st4.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %P)
-	ret void
-}
-
-declare void @llvm.arm64.neon.st2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*) nounwind readonly
-declare void @llvm.arm64.neon.st3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly
-declare void @llvm.arm64.neon.st4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly
-
-define void @st2_4s(<4 x i32> %A, <4 x i32> %B, i32* %P) nounwind {
-; CHECK-LABEL: st2_4s
-; CHECK: st2.4s
-	call void @llvm.arm64.neon.st2.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i32* %P)
-	ret void
-}
-
-define void @st3_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %P) nounwind {
-; CHECK-LABEL: st3_4s
-; CHECK: st3.4s
-	call void @llvm.arm64.neon.st3.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %P)
-	ret void
-}
-
-define void @st4_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %P) nounwind {
-; CHECK-LABEL: st4_4s
-; CHECK: st4.4s
-	call void @llvm.arm64.neon.st4.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %P)
-	ret void
-}
-
-declare void @llvm.arm64.neon.st2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*) nounwind readonly
-declare void @llvm.arm64.neon.st3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly
-declare void @llvm.arm64.neon.st4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly
-
-define void @st2_1d(<1 x i64> %A, <1 x i64> %B, i64* %P) nounwind {
-; CHECK-LABEL: st2_1d
-; CHECK: st1.1d
-	call void @llvm.arm64.neon.st2.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, i64* %P)
-	ret void
-}
-
-define void @st3_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %P) nounwind {
-; CHECK-LABEL: st3_1d
-; CHECK: st1.1d
-	call void @llvm.arm64.neon.st3.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %P)
-	ret void
-}
-
-define void @st4_1d(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %P) nounwind {
-; CHECK-LABEL: st4_1d
-; CHECK: st1.1d
-	call void @llvm.arm64.neon.st4.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %P)
-	ret void
-}
-
-declare void @llvm.arm64.neon.st2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*) nounwind readonly
-declare void @llvm.arm64.neon.st3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly
-declare void @llvm.arm64.neon.st4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly
-
-define void @st2_2d(<2 x i64> %A, <2 x i64> %B, i64* %P) nounwind {
-; CHECK-LABEL: st2_2d
-; CHECK: st2.2d
-	call void @llvm.arm64.neon.st2.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64* %P)
-	ret void
-}
-
-define void @st3_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %P) nounwind {
-; CHECK-LABEL: st3_2d
-; CHECK: st3.2d
-	call void @llvm.arm64.neon.st3.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %P)
-	ret void
-}
-
-define void @st4_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %P) nounwind {
-; CHECK-LABEL: st4_2d
-; CHECK: st4.2d
-	call void @llvm.arm64.neon.st4.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %P)
-	ret void
-}
-
-declare void @llvm.arm64.neon.st2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*) nounwind readonly
-declare void @llvm.arm64.neon.st3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly
-declare void @llvm.arm64.neon.st4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly
-
-declare void @llvm.arm64.neon.st1x2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) nounwind readonly
-declare void @llvm.arm64.neon.st1x2.v4i16.p0i16(<4 x i16>, <4 x i16>, i16*) nounwind readonly
-declare void @llvm.arm64.neon.st1x2.v2i32.p0i32(<2 x i32>, <2 x i32>, i32*) nounwind readonly
-declare void @llvm.arm64.neon.st1x2.v2f32.p0f32(<2 x float>, <2 x float>, float*) nounwind readonly
-declare void @llvm.arm64.neon.st1x2.v1i64.p0i64(<1 x i64>, <1 x i64>, i64*) nounwind readonly
-declare void @llvm.arm64.neon.st1x2.v1f64.p0f64(<1 x double>, <1 x double>, double*) nounwind readonly
-
-define void @st1_x2_v8i8(<8 x i8> %A, <8 x i8> %B, i8* %addr) {
-; CHECK-LABEL: st1_x2_v8i8:
-; CHECK: st1.8b { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x2.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, i8* %addr)
-  ret void
-}
-
-define void @st1_x2_v4i16(<4 x i16> %A, <4 x i16> %B, i16* %addr) {
-; CHECK-LABEL: st1_x2_v4i16:
-; CHECK: st1.4h { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x2.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, i16* %addr)
-  ret void
-}
-
-define void @st1_x2_v2i32(<2 x i32> %A, <2 x i32> %B, i32* %addr) {
-; CHECK-LABEL: st1_x2_v2i32:
-; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x2.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, i32* %addr)
-  ret void
-}
-
-define void @st1_x2_v2f32(<2 x float> %A, <2 x float> %B, float* %addr) {
-; CHECK-LABEL: st1_x2_v2f32:
-; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x2.v2f32.p0f32(<2 x float> %A, <2 x float> %B, float* %addr)
-  ret void
-}
-
-define void @st1_x2_v1i64(<1 x i64> %A, <1 x i64> %B, i64* %addr) {
-; CHECK-LABEL: st1_x2_v1i64:
-; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x2.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, i64* %addr)
-  ret void
-}
-
-define void @st1_x2_v1f64(<1 x double> %A, <1 x double> %B, double* %addr) {
-; CHECK-LABEL: st1_x2_v1f64:
-; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x2.v1f64.p0f64(<1 x double> %A, <1 x double> %B, double* %addr)
-  ret void
-}
-
-declare void @llvm.arm64.neon.st1x2.v16i8.p0i8(<16 x i8>, <16 x i8>, i8*) nounwind readonly
-declare void @llvm.arm64.neon.st1x2.v8i16.p0i16(<8 x i16>, <8 x i16>, i16*) nounwind readonly
-declare void @llvm.arm64.neon.st1x2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*) nounwind readonly
-declare void @llvm.arm64.neon.st1x2.v4f32.p0f32(<4 x float>, <4 x float>, float*) nounwind readonly
-declare void @llvm.arm64.neon.st1x2.v2i64.p0i64(<2 x i64>, <2 x i64>, i64*) nounwind readonly
-declare void @llvm.arm64.neon.st1x2.v2f64.p0f64(<2 x double>, <2 x double>, double*) nounwind readonly
-
-define void @st1_x2_v16i8(<16 x i8> %A, <16 x i8> %B, i8* %addr) {
-; CHECK-LABEL: st1_x2_v16i8:
-; CHECK: st1.16b { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x2.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i8* %addr)
-  ret void
-}
-
-define void @st1_x2_v8i16(<8 x i16> %A, <8 x i16> %B, i16* %addr) {
-; CHECK-LABEL: st1_x2_v8i16:
-; CHECK: st1.8h { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x2.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i16* %addr)
-  ret void
-}
-
-define void @st1_x2_v4i32(<4 x i32> %A, <4 x i32> %B, i32* %addr) {
-; CHECK-LABEL: st1_x2_v4i32:
-; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x2.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i32* %addr)
-  ret void
-}
-
-define void @st1_x2_v4f32(<4 x float> %A, <4 x float> %B, float* %addr) {
-; CHECK-LABEL: st1_x2_v4f32:
-; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x2.v4f32.p0f32(<4 x float> %A, <4 x float> %B, float* %addr)
-  ret void
-}
-
-define void @st1_x2_v2i64(<2 x i64> %A, <2 x i64> %B, i64* %addr) {
-; CHECK-LABEL: st1_x2_v2i64:
-; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x2.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64* %addr)
-  ret void
-}
-
-define void @st1_x2_v2f64(<2 x double> %A, <2 x double> %B, double* %addr) {
-; CHECK-LABEL: st1_x2_v2f64:
-; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x2.v2f64.p0f64(<2 x double> %A, <2 x double> %B, double* %addr)
-  ret void
-}
-
-declare void @llvm.arm64.neon.st1x3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly
-declare void @llvm.arm64.neon.st1x3.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly
-declare void @llvm.arm64.neon.st1x3.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly
-declare void @llvm.arm64.neon.st1x3.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, float*) nounwind readonly
-declare void @llvm.arm64.neon.st1x3.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly
-declare void @llvm.arm64.neon.st1x3.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, double*) nounwind readonly
-
-define void @st1_x3_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %addr) {
-; CHECK-LABEL: st1_x3_v8i8:
-; CHECK: st1.8b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x3.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, i8* %addr)
-  ret void
-}
-
-define void @st1_x3_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %addr) {
-; CHECK-LABEL: st1_x3_v4i16:
-; CHECK: st1.4h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x3.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i16* %addr)
-  ret void
-}
-
-define void @st1_x3_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %addr) {
-; CHECK-LABEL: st1_x3_v2i32:
-; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x3.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, i32* %addr)
-  ret void
-}
-
-define void @st1_x3_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, float* %addr) {
-; CHECK-LABEL: st1_x3_v2f32:
-; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x3.v2f32.p0f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, float* %addr)
-  ret void
-}
-
-define void @st1_x3_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %addr) {
-; CHECK-LABEL: st1_x3_v1i64:
-; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x3.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, i64* %addr)
-  ret void
-}
-
-define void @st1_x3_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, double* %addr) {
-; CHECK-LABEL: st1_x3_v1f64:
-; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x3.v1f64.p0f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, double* %addr)
-  ret void
-}
-
-declare void @llvm.arm64.neon.st1x3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly
-declare void @llvm.arm64.neon.st1x3.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly
-declare void @llvm.arm64.neon.st1x3.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly
-declare void @llvm.arm64.neon.st1x3.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, float*) nounwind readonly
-declare void @llvm.arm64.neon.st1x3.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly
-declare void @llvm.arm64.neon.st1x3.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, double*) nounwind readonly
-
-define void @st1_x3_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %addr) {
-; CHECK-LABEL: st1_x3_v16i8:
-; CHECK: st1.16b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x3.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, i8* %addr)
-  ret void
-}
-
-define void @st1_x3_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %addr) {
-; CHECK-LABEL: st1_x3_v8i16:
-; CHECK: st1.8h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x3.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i16* %addr)
-  ret void
-}
-
-define void @st1_x3_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %addr) {
-; CHECK-LABEL: st1_x3_v4i32:
-; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x3.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, i32* %addr)
-  ret void
-}
-
-define void @st1_x3_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, float* %addr) {
-; CHECK-LABEL: st1_x3_v4f32:
-; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x3.v4f32.p0f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, float* %addr)
-  ret void
-}
-
-define void @st1_x3_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %addr) {
-; CHECK-LABEL: st1_x3_v2i64:
-; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x3.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, i64* %addr)
-  ret void
-}
-
-define void @st1_x3_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, double* %addr) {
-; CHECK-LABEL: st1_x3_v2f64:
-; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x3.v2f64.p0f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, double* %addr)
-  ret void
-}
-
-
-declare void @llvm.arm64.neon.st1x4.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i8*) nounwind readonly
-declare void @llvm.arm64.neon.st1x4.v4i16.p0i16(<4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i16*) nounwind readonly
-declare void @llvm.arm64.neon.st1x4.v2i32.p0i32(<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32*) nounwind readonly
-declare void @llvm.arm64.neon.st1x4.v2f32.p0f32(<2 x float>, <2 x float>, <2 x float>, <2 x float>, float*) nounwind readonly
-declare void @llvm.arm64.neon.st1x4.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64*) nounwind readonly
-declare void @llvm.arm64.neon.st1x4.v1f64.p0f64(<1 x double>, <1 x double>, <1 x double>, <1 x double>, double*) nounwind readonly
-
-define void @st1_x4_v8i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %addr) {
-; CHECK-LABEL: st1_x4_v8i8:
-; CHECK: st1.8b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x4.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %addr)
-  ret void
-}
-
-define void @st1_x4_v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %addr) {
-; CHECK-LABEL: st1_x4_v4i16:
-; CHECK: st1.4h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x4.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %addr)
-  ret void
-}
-
-define void @st1_x4_v2i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %addr) {
-; CHECK-LABEL: st1_x4_v2i32:
-; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x4.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %addr)
-  ret void
-}
-
-define void @st1_x4_v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x float> %D, float* %addr) {
-; CHECK-LABEL: st1_x4_v2f32:
-; CHECK: st1.2s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x4.v2f32.p0f32(<2 x float> %A, <2 x float> %B, <2 x float> %C, <2 x float> %D, float* %addr)
-  ret void
-}
-
-define void @st1_x4_v1i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %addr) {
-; CHECK-LABEL: st1_x4_v1i64:
-; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x4.v1i64.p0i64(<1 x i64> %A, <1 x i64> %B, <1 x i64> %C, <1 x i64> %D, i64* %addr)
-  ret void
-}
-
-define void @st1_x4_v1f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, double* %addr) {
-; CHECK-LABEL: st1_x4_v1f64:
-; CHECK: st1.1d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x4.v1f64.p0f64(<1 x double> %A, <1 x double> %B, <1 x double> %C, <1 x double> %D, double* %addr)
-  ret void
-}
-
-declare void @llvm.arm64.neon.st1x4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) nounwind readonly
-declare void @llvm.arm64.neon.st1x4.v8i16.p0i16(<8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i16*) nounwind readonly
-declare void @llvm.arm64.neon.st1x4.v4i32.p0i32(<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32*) nounwind readonly
-declare void @llvm.arm64.neon.st1x4.v4f32.p0f32(<4 x float>, <4 x float>, <4 x float>, <4 x float>, float*) nounwind readonly
-declare void @llvm.arm64.neon.st1x4.v2i64.p0i64(<2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i64*) nounwind readonly
-declare void @llvm.arm64.neon.st1x4.v2f64.p0f64(<2 x double>, <2 x double>, <2 x double>, <2 x double>, double*) nounwind readonly
-
-define void @st1_x4_v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %addr) {
-; CHECK-LABEL: st1_x4_v16i8:
-; CHECK: st1.16b { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x4.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %addr)
-  ret void
-}
-
-define void @st1_x4_v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %addr) {
-; CHECK-LABEL: st1_x4_v8i16:
-; CHECK: st1.8h { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x4.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %addr)
-  ret void
-}
-
-define void @st1_x4_v4i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %addr) {
-; CHECK-LABEL: st1_x4_v4i32:
-; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x4.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %addr)
-  ret void
-}
-
-define void @st1_x4_v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x float> %D, float* %addr) {
-; CHECK-LABEL: st1_x4_v4f32:
-; CHECK: st1.4s { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x4.v4f32.p0f32(<4 x float> %A, <4 x float> %B, <4 x float> %C, <4 x float> %D, float* %addr)
-  ret void
-}
-
-define void @st1_x4_v2i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %addr) {
-; CHECK-LABEL: st1_x4_v2i64:
-; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x4.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %addr)
-  ret void
-}
-
-define void @st1_x4_v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, double* %addr) {
-; CHECK-LABEL: st1_x4_v2f64:
-; CHECK: st1.2d { {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} }, [x0]
-  call void @llvm.arm64.neon.st1x4.v2f64.p0f64(<2 x double> %A, <2 x double> %B, <2 x double> %C, <2 x double> %D, double* %addr)
-  ret void
-}
diff --git a/llvm/test/CodeGen/ARM64/tbl.ll b/llvm/test/CodeGen/ARM64/tbl.ll
deleted file mode 100644
index e1edd21..0000000
--- a/llvm/test/CodeGen/ARM64/tbl.ll
+++ /dev/null
@@ -1,132 +0,0 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
-
-define <8 x i8> @tbl1_8b(<16 x i8> %A, <8 x i8> %B) nounwind {
-; CHECK: tbl1_8b
-; CHECK: tbl.8b
-  %tmp3 = call <8 x i8> @llvm.arm64.neon.tbl1.v8i8(<16 x i8> %A, <8 x i8> %B)
-  ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @tbl1_16b(<16 x i8> %A, <16 x i8> %B) nounwind {
-; CHECK: tbl1_16b
-; CHECK: tbl.16b
-  %tmp3 = call <16 x i8> @llvm.arm64.neon.tbl1.v16i8(<16 x i8> %A, <16 x i8> %B)
-  ret <16 x i8> %tmp3
-}
-
-define <8 x i8> @tbl2_8b(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C) {
-; CHECK: tbl2_8b
-; CHECK: tbl.8b
-  %tmp3 = call <8 x i8> @llvm.arm64.neon.tbl2.v8i8(<16 x i8> %A, <16 x i8> %B, <8 x i8> %C)
-  ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @tbl2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) {
-; CHECK: tbl2_16b
-; CHECK: tbl.16b
-  %tmp3 = call <16 x i8> @llvm.arm64.neon.tbl2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C)
-  ret <16 x i8> %tmp3
-}
-
-define <8 x i8> @tbl3_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
-; CHECK: tbl3_8b
-; CHECK: tbl.8b
-  %tmp3 = call <8 x i8> @llvm.arm64.neon.tbl3.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D)
-  ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @tbl3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) {
-; CHECK: tbl3_16b
-; CHECK: tbl.16b
-  %tmp3 = call <16 x i8> @llvm.arm64.neon.tbl3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D)
-  ret <16 x i8> %tmp3
-}
-
-define <8 x i8> @tbl4_8b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) {
-; CHECK: tbl4_8b
-; CHECK: tbl.8b
-  %tmp3 = call <8 x i8> @llvm.arm64.neon.tbl4.v8i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E)
-  ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @tbl4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) {
-; CHECK: tbl4_16b
-; CHECK: tbl.16b
-  %tmp3 = call <16 x i8> @llvm.arm64.neon.tbl4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E)
-  ret <16 x i8> %tmp3
-}
-
-declare <8 x i8> @llvm.arm64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.tbl2.v8i8(<16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.tbl2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.tbl3.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.tbl3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.tbl4.v8i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.tbl4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-
-define <8 x i8> @tbx1_8b(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C) nounwind {
-; CHECK: tbx1_8b
-; CHECK: tbx.8b
-  %tmp3 = call <8 x i8> @llvm.arm64.neon.tbx1.v8i8(<8 x i8> %A, <16 x i8> %B, <8 x i8> %C)
-  ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @tbx1_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) nounwind {
-; CHECK: tbx1_16b
-; CHECK: tbx.16b
-  %tmp3 = call <16 x i8> @llvm.arm64.neon.tbx1.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C)
-  ret <16 x i8> %tmp3
-}
-
-define <8 x i8> @tbx2_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D) {
-; CHECK: tbx2_8b
-; CHECK: tbx.8b
-  %tmp3 = call <8 x i8> @llvm.arm64.neon.tbx2.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <8 x i8> %D)
-  ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @tbx2_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D) {
-; CHECK: tbx2_16b
-; CHECK: tbx.16b
-  %tmp3 = call <16 x i8> @llvm.arm64.neon.tbx2.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D)
-  ret <16 x i8> %tmp3
-}
-
-define <8 x i8> @tbx3_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E) {
-; CHECK: tbx3_8b
-; CHECK: tbx.8b
-  %tmp3 = call <8 x i8> @llvm.arm64.neon.tbx3.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <8 x i8> %E)
-  ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @tbx3_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E) {
-; CHECK: tbx3_16b
-; CHECK: tbx.16b
-  %tmp3 = call <16 x i8> @llvm.arm64.neon.tbx3.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E)
-  ret <16 x i8> %tmp3
-}
-
-define <8 x i8> @tbx4_8b(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F) {
-; CHECK: tbx4_8b
-; CHECK: tbx.8b
-  %tmp3 = call <8 x i8> @llvm.arm64.neon.tbx4.v8i8(<8 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <8 x i8> %F)
-  ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @tbx4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F) {
-; CHECK: tbx4_16b
-; CHECK: tbx.16b
-  %tmp3 = call <16 x i8> @llvm.arm64.neon.tbx4.v16i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, <16 x i8> %E, <16 x i8> %F)
-  ret <16 x i8> %tmp3
-}
-
-declare <8 x i8> @llvm.arm64.neon.tbx1.v8i8(<8 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.tbx1.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.tbx2.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.tbx2.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.tbx3.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.tbx3.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i8> @llvm.arm64.neon.tbx4.v8i8(<8 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.tbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-
diff --git a/llvm/test/CodeGen/ARM64/vaddlv.ll b/llvm/test/CodeGen/ARM64/vaddlv.ll
deleted file mode 100644
index d4d4608..0000000
--- a/llvm/test/CodeGen/ARM64/vaddlv.ll
+++ /dev/null
@@ -1,26 +0,0 @@
-; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
-
-define i64 @test_vaddlv_s32(<2 x i32> %a1) nounwind readnone {
-; CHECK: test_vaddlv_s32
-; CHECK: saddlp.1d v[[REGNUM:[0-9]+]], v[[INREG:[0-9]+]]
-; CHECK-NEXT: fmov x[[OUTREG:[0-9]+]], d[[REGNUM]]
-; CHECK-NEXT: ret
-entry:
-  %vaddlv.i = tail call i64 @llvm.arm64.neon.saddlv.i64.v2i32(<2 x i32> %a1) nounwind
-  ret i64 %vaddlv.i
-}
-
-define i64 @test_vaddlv_u32(<2 x i32> %a1) nounwind readnone {
-; CHECK: test_vaddlv_u32
-; CHECK: uaddlp.1d v[[REGNUM:[0-9]+]], v[[INREG:[0-9]+]]
-; CHECK-NEXT: fmov x[[OUTREG:[0-9]+]], d[[REGNUM]]
-; CHECK-NEXT: ret
-entry:
-  %vaddlv.i = tail call i64 @llvm.arm64.neon.uaddlv.i64.v2i32(<2 x i32> %a1) nounwind
-  ret i64 %vaddlv.i
-}
-
-declare i64 @llvm.arm64.neon.uaddlv.i64.v2i32(<2 x i32>) nounwind readnone
-
-declare i64 @llvm.arm64.neon.saddlv.i64.v2i32(<2 x i32>) nounwind readnone
-
diff --git a/llvm/test/CodeGen/ARM64/vcnt.ll b/llvm/test/CodeGen/ARM64/vcnt.ll
deleted file mode 100644
index e00658a..0000000
--- a/llvm/test/CodeGen/ARM64/vcnt.ll
+++ /dev/null
@@ -1,56 +0,0 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
-
-define <8 x i8> @cls_8b(<8 x i8>* %A) nounwind {
-;CHECK-LABEL: cls_8b:
-;CHECK: cls.8b
-	%tmp1 = load <8 x i8>* %A
-	%tmp3 = call <8 x i8> @llvm.arm64.neon.cls.v8i8(<8 x i8> %tmp1)
-	ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @cls_16b(<16 x i8>* %A) nounwind {
-;CHECK-LABEL: cls_16b:
-;CHECK: cls.16b
-	%tmp1 = load <16 x i8>* %A
-	%tmp3 = call <16 x i8> @llvm.arm64.neon.cls.v16i8(<16 x i8> %tmp1)
-	ret <16 x i8> %tmp3
-}
-
-define <4 x i16> @cls_4h(<4 x i16>* %A) nounwind {
-;CHECK-LABEL: cls_4h:
-;CHECK: cls.4h
-	%tmp1 = load <4 x i16>* %A
-	%tmp3 = call <4 x i16> @llvm.arm64.neon.cls.v4i16(<4 x i16> %tmp1)
-	ret <4 x i16> %tmp3
-}
-
-define <8 x i16> @cls_8h(<8 x i16>* %A) nounwind {
-;CHECK-LABEL: cls_8h:
-;CHECK: cls.8h
-	%tmp1 = load <8 x i16>* %A
-	%tmp3 = call <8 x i16> @llvm.arm64.neon.cls.v8i16(<8 x i16> %tmp1)
-	ret <8 x i16> %tmp3
-}
-
-define <2 x i32> @cls_2s(<2 x i32>* %A) nounwind {
-;CHECK-LABEL: cls_2s:
-;CHECK: cls.2s
-	%tmp1 = load <2 x i32>* %A
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.cls.v2i32(<2 x i32> %tmp1)
-	ret <2 x i32> %tmp3
-}
-
-define <4 x i32> @cls_4s(<4 x i32>* %A) nounwind {
-;CHECK-LABEL: cls_4s:
-;CHECK: cls.4s
-	%tmp1 = load <4 x i32>* %A
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.cls.v4i32(<4 x i32> %tmp1)
-	ret <4 x i32> %tmp3
-}
-
-declare <8 x i8> @llvm.arm64.neon.cls.v8i8(<8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.cls.v16i8(<16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.cls.v4i16(<4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.cls.v8i16(<8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.cls.v2i32(<2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.cls.v4i32(<4 x i32>) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM64/vcvt_n.ll b/llvm/test/CodeGen/ARM64/vcvt_n.ll
deleted file mode 100644
index 46de557..0000000
--- a/llvm/test/CodeGen/ARM64/vcvt_n.ll
+++ /dev/null
@@ -1,49 +0,0 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
-
-define <2 x float> @cvtf32fxpu(<2 x i32> %a) nounwind readnone ssp {
-; CHECK-LABEL: cvtf32fxpu:
-; CHECK: ucvtf.2s	v0, v0, #9
-; CHECK: ret
-  %vcvt_n1 = tail call <2 x float> @llvm.arm64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %a, i32 9)
-  ret <2 x float> %vcvt_n1
-}
-
-define <2 x float> @cvtf32fxps(<2 x i32> %a) nounwind readnone ssp {
-; CHECK-LABEL: cvtf32fxps:
-; CHECK: scvtf.2s	v0, v0, #12
-; CHECK: ret
-  %vcvt_n1 = tail call <2 x float> @llvm.arm64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %a, i32 12)
-  ret <2 x float> %vcvt_n1
-}
-
-define <4 x float> @cvtqf32fxpu(<4 x i32> %a) nounwind readnone ssp {
-; CHECK-LABEL: cvtqf32fxpu:
-; CHECK: ucvtf.4s	v0, v0, #18
-; CHECK: ret
-  %vcvt_n1 = tail call <4 x float> @llvm.arm64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %a, i32 18)
-  ret <4 x float> %vcvt_n1
-}
-
-define <4 x float> @cvtqf32fxps(<4 x i32> %a) nounwind readnone ssp {
-; CHECK-LABEL: cvtqf32fxps:
-; CHECK: scvtf.4s	v0, v0, #30
-; CHECK: ret
-  %vcvt_n1 = tail call <4 x float> @llvm.arm64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %a, i32 30)
-  ret <4 x float> %vcvt_n1
-}
-
-define <2 x double> @f1(<2 x i64> %a) nounwind readnone ssp {
-  %vcvt_n1 = tail call <2 x double> @llvm.arm64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> %a, i32 12)
-  ret <2 x double> %vcvt_n1
-}
-
-define <2 x double> @f2(<2 x i64> %a) nounwind readnone ssp {
-  %vcvt_n1 = tail call <2 x double> @llvm.arm64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> %a, i32 9)
-  ret <2 x double> %vcvt_n1
-}
-
-declare <4 x float> @llvm.arm64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone
-declare <2 x float> @llvm.arm64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
-declare <2 x float> @llvm.arm64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM64/vcvtxd_f32_f64.ll b/llvm/test/CodeGen/ARM64/vcvtxd_f32_f64.ll
deleted file mode 100644
index bbe8f0b..0000000
--- a/llvm/test/CodeGen/ARM64/vcvtxd_f32_f64.ll
+++ /dev/null
@@ -1,11 +0,0 @@
-; RUN: llc < %s -march=arm64 | FileCheck %s
-
-define float @fcvtxn(double %a) {
-; CHECK-LABEL: fcvtxn:
-; CHECK: fcvtxn s0, d0
-; CHECK-NEXT: ret
-  %vcvtxd.i = tail call float @llvm.arm64.sisd.fcvtxn(double %a) nounwind
-  ret float %vcvtxd.i
-}
-
-declare float @llvm.arm64.sisd.fcvtxn(double) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM64/vhadd.ll b/llvm/test/CodeGen/ARM64/vhadd.ll
deleted file mode 100644
index aed7681..0000000
--- a/llvm/test/CodeGen/ARM64/vhadd.ll
+++ /dev/null
@@ -1,249 +0,0 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
-
-define <8 x i8> @shadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: shadd8b:
-;CHECK: shadd.8b
-	%tmp1 = load <8 x i8>* %A
-	%tmp2 = load <8 x i8>* %B
-	%tmp3 = call <8 x i8> @llvm.arm64.neon.shadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
-	ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @shadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: shadd16b:
-;CHECK: shadd.16b
-	%tmp1 = load <16 x i8>* %A
-	%tmp2 = load <16 x i8>* %B
-	%tmp3 = call <16 x i8> @llvm.arm64.neon.shadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
-	ret <16 x i8> %tmp3
-}
-
-define <4 x i16> @shadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: shadd4h:
-;CHECK: shadd.4h
-	%tmp1 = load <4 x i16>* %A
-	%tmp2 = load <4 x i16>* %B
-	%tmp3 = call <4 x i16> @llvm.arm64.neon.shadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
-	ret <4 x i16> %tmp3
-}
-
-define <8 x i16> @shadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: shadd8h:
-;CHECK: shadd.8h
-	%tmp1 = load <8 x i16>* %A
-	%tmp2 = load <8 x i16>* %B
-	%tmp3 = call <8 x i16> @llvm.arm64.neon.shadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
-	ret <8 x i16> %tmp3
-}
-
-define <2 x i32> @shadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: shadd2s:
-;CHECK: shadd.2s
-	%tmp1 = load <2 x i32>* %A
-	%tmp2 = load <2 x i32>* %B
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.shadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
-	ret <2 x i32> %tmp3
-}
-
-define <4 x i32> @shadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: shadd4s:
-;CHECK: shadd.4s
-	%tmp1 = load <4 x i32>* %A
-	%tmp2 = load <4 x i32>* %B
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.shadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
-	ret <4 x i32> %tmp3
-}
-
-define <8 x i8> @uhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: uhadd8b:
-;CHECK: uhadd.8b
-	%tmp1 = load <8 x i8>* %A
-	%tmp2 = load <8 x i8>* %B
-	%tmp3 = call <8 x i8> @llvm.arm64.neon.uhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
-	ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @uhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: uhadd16b:
-;CHECK: uhadd.16b
-	%tmp1 = load <16 x i8>* %A
-	%tmp2 = load <16 x i8>* %B
-	%tmp3 = call <16 x i8> @llvm.arm64.neon.uhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
-	ret <16 x i8> %tmp3
-}
-
-define <4 x i16> @uhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: uhadd4h:
-;CHECK: uhadd.4h
-	%tmp1 = load <4 x i16>* %A
-	%tmp2 = load <4 x i16>* %B
-	%tmp3 = call <4 x i16> @llvm.arm64.neon.uhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
-	ret <4 x i16> %tmp3
-}
-
-define <8 x i16> @uhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: uhadd8h:
-;CHECK: uhadd.8h
-	%tmp1 = load <8 x i16>* %A
-	%tmp2 = load <8 x i16>* %B
-	%tmp3 = call <8 x i16> @llvm.arm64.neon.uhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
-	ret <8 x i16> %tmp3
-}
-
-define <2 x i32> @uhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: uhadd2s:
-;CHECK: uhadd.2s
-	%tmp1 = load <2 x i32>* %A
-	%tmp2 = load <2 x i32>* %B
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.uhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
-	ret <2 x i32> %tmp3
-}
-
-define <4 x i32> @uhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: uhadd4s:
-;CHECK: uhadd.4s
-	%tmp1 = load <4 x i32>* %A
-	%tmp2 = load <4 x i32>* %B
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.uhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
-	ret <4 x i32> %tmp3
-}
-
-declare <8 x i8>  @llvm.arm64.neon.shadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.shadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.shadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-
-declare <8 x i8>  @llvm.arm64.neon.uhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.uhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.uhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-
-declare <16 x i8> @llvm.arm64.neon.shadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.shadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.shadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-
-declare <16 x i8> @llvm.arm64.neon.uhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.uhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.uhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <8 x i8> @srhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: srhadd8b:
-;CHECK: srhadd.8b
-	%tmp1 = load <8 x i8>* %A
-	%tmp2 = load <8 x i8>* %B
-	%tmp3 = call <8 x i8> @llvm.arm64.neon.srhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
-	ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @srhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: srhadd16b:
-;CHECK: srhadd.16b
-	%tmp1 = load <16 x i8>* %A
-	%tmp2 = load <16 x i8>* %B
-	%tmp3 = call <16 x i8> @llvm.arm64.neon.srhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
-	ret <16 x i8> %tmp3
-}
-
-define <4 x i16> @srhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: srhadd4h:
-;CHECK: srhadd.4h
-	%tmp1 = load <4 x i16>* %A
-	%tmp2 = load <4 x i16>* %B
-	%tmp3 = call <4 x i16> @llvm.arm64.neon.srhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
-	ret <4 x i16> %tmp3
-}
-
-define <8 x i16> @srhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: srhadd8h:
-;CHECK: srhadd.8h
-	%tmp1 = load <8 x i16>* %A
-	%tmp2 = load <8 x i16>* %B
-	%tmp3 = call <8 x i16> @llvm.arm64.neon.srhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
-	ret <8 x i16> %tmp3
-}
-
-define <2 x i32> @srhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: srhadd2s:
-;CHECK: srhadd.2s
-	%tmp1 = load <2 x i32>* %A
-	%tmp2 = load <2 x i32>* %B
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.srhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
-	ret <2 x i32> %tmp3
-}
-
-define <4 x i32> @srhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: srhadd4s:
-;CHECK: srhadd.4s
-	%tmp1 = load <4 x i32>* %A
-	%tmp2 = load <4 x i32>* %B
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.srhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
-	ret <4 x i32> %tmp3
-}
-
-define <8 x i8> @urhadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: urhadd8b:
-;CHECK: urhadd.8b
-	%tmp1 = load <8 x i8>* %A
-	%tmp2 = load <8 x i8>* %B
-	%tmp3 = call <8 x i8> @llvm.arm64.neon.urhadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
-	ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @urhadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: urhadd16b:
-;CHECK: urhadd.16b
-	%tmp1 = load <16 x i8>* %A
-	%tmp2 = load <16 x i8>* %B
-	%tmp3 = call <16 x i8> @llvm.arm64.neon.urhadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
-	ret <16 x i8> %tmp3
-}
-
-define <4 x i16> @urhadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: urhadd4h:
-;CHECK: urhadd.4h
-	%tmp1 = load <4 x i16>* %A
-	%tmp2 = load <4 x i16>* %B
-	%tmp3 = call <4 x i16> @llvm.arm64.neon.urhadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
-	ret <4 x i16> %tmp3
-}
-
-define <8 x i16> @urhadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: urhadd8h:
-;CHECK: urhadd.8h
-	%tmp1 = load <8 x i16>* %A
-	%tmp2 = load <8 x i16>* %B
-	%tmp3 = call <8 x i16> @llvm.arm64.neon.urhadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
-	ret <8 x i16> %tmp3
-}
-
-define <2 x i32> @urhadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: urhadd2s:
-;CHECK: urhadd.2s
-	%tmp1 = load <2 x i32>* %A
-	%tmp2 = load <2 x i32>* %B
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.urhadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
-	ret <2 x i32> %tmp3
-}
-
-define <4 x i32> @urhadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: urhadd4s:
-;CHECK: urhadd.4s
-	%tmp1 = load <4 x i32>* %A
-	%tmp2 = load <4 x i32>* %B
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.urhadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
-	ret <4 x i32> %tmp3
-}
-
-declare <8 x i8>  @llvm.arm64.neon.srhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.srhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.srhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-
-declare <8 x i8>  @llvm.arm64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.urhadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.urhadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-
-declare <16 x i8> @llvm.arm64.neon.srhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.srhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.srhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-
-declare <16 x i8> @llvm.arm64.neon.urhadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.urhadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.urhadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM64/vhsub.ll b/llvm/test/CodeGen/ARM64/vhsub.ll
deleted file mode 100644
index 85df4d4..0000000
--- a/llvm/test/CodeGen/ARM64/vhsub.ll
+++ /dev/null
@@ -1,125 +0,0 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
-
-define <8 x i8> @shsub8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: shsub8b:
-;CHECK: shsub.8b
-	%tmp1 = load <8 x i8>* %A
-	%tmp2 = load <8 x i8>* %B
-	%tmp3 = call <8 x i8> @llvm.arm64.neon.shsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
-	ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @shsub16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: shsub16b:
-;CHECK: shsub.16b
-	%tmp1 = load <16 x i8>* %A
-	%tmp2 = load <16 x i8>* %B
-	%tmp3 = call <16 x i8> @llvm.arm64.neon.shsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
-	ret <16 x i8> %tmp3
-}
-
-define <4 x i16> @shsub4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: shsub4h:
-;CHECK: shsub.4h
-	%tmp1 = load <4 x i16>* %A
-	%tmp2 = load <4 x i16>* %B
-	%tmp3 = call <4 x i16> @llvm.arm64.neon.shsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
-	ret <4 x i16> %tmp3
-}
-
-define <8 x i16> @shsub8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: shsub8h:
-;CHECK: shsub.8h
-	%tmp1 = load <8 x i16>* %A
-	%tmp2 = load <8 x i16>* %B
-	%tmp3 = call <8 x i16> @llvm.arm64.neon.shsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
-	ret <8 x i16> %tmp3
-}
-
-define <2 x i32> @shsub2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: shsub2s:
-;CHECK: shsub.2s
-	%tmp1 = load <2 x i32>* %A
-	%tmp2 = load <2 x i32>* %B
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.shsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
-	ret <2 x i32> %tmp3
-}
-
-define <4 x i32> @shsub4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: shsub4s:
-;CHECK: shsub.4s
-	%tmp1 = load <4 x i32>* %A
-	%tmp2 = load <4 x i32>* %B
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.shsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
-	ret <4 x i32> %tmp3
-}
-
-define <8 x i8> @uhsub8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: uhsub8b:
-;CHECK: uhsub.8b
-	%tmp1 = load <8 x i8>* %A
-	%tmp2 = load <8 x i8>* %B
-	%tmp3 = call <8 x i8> @llvm.arm64.neon.uhsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
-	ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @uhsub16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: uhsub16b:
-;CHECK: uhsub.16b
-	%tmp1 = load <16 x i8>* %A
-	%tmp2 = load <16 x i8>* %B
-	%tmp3 = call <16 x i8> @llvm.arm64.neon.uhsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
-	ret <16 x i8> %tmp3
-}
-
-define <4 x i16> @uhsub4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: uhsub4h:
-;CHECK: uhsub.4h
-	%tmp1 = load <4 x i16>* %A
-	%tmp2 = load <4 x i16>* %B
-	%tmp3 = call <4 x i16> @llvm.arm64.neon.uhsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
-	ret <4 x i16> %tmp3
-}
-
-define <8 x i16> @uhsub8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: uhsub8h:
-;CHECK: uhsub.8h
-	%tmp1 = load <8 x i16>* %A
-	%tmp2 = load <8 x i16>* %B
-	%tmp3 = call <8 x i16> @llvm.arm64.neon.uhsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
-	ret <8 x i16> %tmp3
-}
-
-define <2 x i32> @uhsub2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: uhsub2s:
-;CHECK: uhsub.2s
-	%tmp1 = load <2 x i32>* %A
-	%tmp2 = load <2 x i32>* %B
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.uhsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
-	ret <2 x i32> %tmp3
-}
-
-define <4 x i32> @uhsub4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: uhsub4s:
-;CHECK: uhsub.4s
-	%tmp1 = load <4 x i32>* %A
-	%tmp2 = load <4 x i32>* %B
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.uhsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
-	ret <4 x i32> %tmp3
-}
-
-declare <8 x i8>  @llvm.arm64.neon.shsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.shsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.shsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-
-declare <8 x i8>  @llvm.arm64.neon.uhsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.uhsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.uhsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-
-declare <16 x i8> @llvm.arm64.neon.shsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.shsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.shsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-
-declare <16 x i8> @llvm.arm64.neon.uhsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.uhsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.uhsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM64/vmax.ll b/llvm/test/CodeGen/ARM64/vmax.ll
deleted file mode 100644
index b2426f3..0000000
--- a/llvm/test/CodeGen/ARM64/vmax.ll
+++ /dev/null
@@ -1,679 +0,0 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
-
-define <8 x i8> @smax_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: smax_8b:
-;CHECK: smax.8b
-	%tmp1 = load <8 x i8>* %A
-	%tmp2 = load <8 x i8>* %B
-	%tmp3 = call <8 x i8> @llvm.arm64.neon.smax.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
-	ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @smax_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: smax_16b:
-;CHECK: smax.16b
-	%tmp1 = load <16 x i8>* %A
-	%tmp2 = load <16 x i8>* %B
-	%tmp3 = call <16 x i8> @llvm.arm64.neon.smax.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
-	ret <16 x i8> %tmp3
-}
-
-define <4 x i16> @smax_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: smax_4h:
-;CHECK: smax.4h
-	%tmp1 = load <4 x i16>* %A
-	%tmp2 = load <4 x i16>* %B
-	%tmp3 = call <4 x i16> @llvm.arm64.neon.smax.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
-	ret <4 x i16> %tmp3
-}
-
-define <8 x i16> @smax_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: smax_8h:
-;CHECK: smax.8h
-	%tmp1 = load <8 x i16>* %A
-	%tmp2 = load <8 x i16>* %B
-	%tmp3 = call <8 x i16> @llvm.arm64.neon.smax.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
-	ret <8 x i16> %tmp3
-}
-
-define <2 x i32> @smax_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: smax_2s:
-;CHECK: smax.2s
-	%tmp1 = load <2 x i32>* %A
-	%tmp2 = load <2 x i32>* %B
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.smax.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
-	ret <2 x i32> %tmp3
-}
-
-define <4 x i32> @smax_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: smax_4s:
-;CHECK: smax.4s
-	%tmp1 = load <4 x i32>* %A
-	%tmp2 = load <4 x i32>* %B
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.smax.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
-	ret <4 x i32> %tmp3
-}
-
-declare <8 x i8> @llvm.arm64.neon.smax.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.smax.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.smax.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.smax.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.smax.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.smax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <8 x i8> @umax_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: umax_8b:
-;CHECK: umax.8b
-	%tmp1 = load <8 x i8>* %A
-	%tmp2 = load <8 x i8>* %B
-	%tmp3 = call <8 x i8> @llvm.arm64.neon.umax.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
-	ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @umax_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: umax_16b:
-;CHECK: umax.16b
-	%tmp1 = load <16 x i8>* %A
-	%tmp2 = load <16 x i8>* %B
-	%tmp3 = call <16 x i8> @llvm.arm64.neon.umax.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
-	ret <16 x i8> %tmp3
-}
-
-define <4 x i16> @umax_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: umax_4h:
-;CHECK: umax.4h
-	%tmp1 = load <4 x i16>* %A
-	%tmp2 = load <4 x i16>* %B
-	%tmp3 = call <4 x i16> @llvm.arm64.neon.umax.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
-	ret <4 x i16> %tmp3
-}
-
-define <8 x i16> @umax_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: umax_8h:
-;CHECK: umax.8h
-	%tmp1 = load <8 x i16>* %A
-	%tmp2 = load <8 x i16>* %B
-	%tmp3 = call <8 x i16> @llvm.arm64.neon.umax.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
-	ret <8 x i16> %tmp3
-}
-
-define <2 x i32> @umax_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: umax_2s:
-;CHECK: umax.2s
-	%tmp1 = load <2 x i32>* %A
-	%tmp2 = load <2 x i32>* %B
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.umax.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
-	ret <2 x i32> %tmp3
-}
-
-define <4 x i32> @umax_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: umax_4s:
-;CHECK: umax.4s
-	%tmp1 = load <4 x i32>* %A
-	%tmp2 = load <4 x i32>* %B
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.umax.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
-	ret <4 x i32> %tmp3
-}
-
-declare <8 x i8> @llvm.arm64.neon.umax.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.umax.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.umax.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.umax.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.umax.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.umax.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <8 x i8> @smin_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: smin_8b:
-;CHECK: smin.8b
-	%tmp1 = load <8 x i8>* %A
-	%tmp2 = load <8 x i8>* %B
-	%tmp3 = call <8 x i8> @llvm.arm64.neon.smin.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
-	ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @smin_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: smin_16b:
-;CHECK: smin.16b
-	%tmp1 = load <16 x i8>* %A
-	%tmp2 = load <16 x i8>* %B
-	%tmp3 = call <16 x i8> @llvm.arm64.neon.smin.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
-	ret <16 x i8> %tmp3
-}
-
-define <4 x i16> @smin_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: smin_4h:
-;CHECK: smin.4h
-	%tmp1 = load <4 x i16>* %A
-	%tmp2 = load <4 x i16>* %B
-	%tmp3 = call <4 x i16> @llvm.arm64.neon.smin.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
-	ret <4 x i16> %tmp3
-}
-
-define <8 x i16> @smin_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: smin_8h:
-;CHECK: smin.8h
-	%tmp1 = load <8 x i16>* %A
-	%tmp2 = load <8 x i16>* %B
-	%tmp3 = call <8 x i16> @llvm.arm64.neon.smin.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
-	ret <8 x i16> %tmp3
-}
-
-define <2 x i32> @smin_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: smin_2s:
-;CHECK: smin.2s
-	%tmp1 = load <2 x i32>* %A
-	%tmp2 = load <2 x i32>* %B
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.smin.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
-	ret <2 x i32> %tmp3
-}
-
-define <4 x i32> @smin_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: smin_4s:
-;CHECK: smin.4s
-	%tmp1 = load <4 x i32>* %A
-	%tmp2 = load <4 x i32>* %B
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.smin.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
-	ret <4 x i32> %tmp3
-}
-
-declare <8 x i8> @llvm.arm64.neon.smin.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.smin.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.smin.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.smin.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.smin.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.smin.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <8 x i8> @umin_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: umin_8b:
-;CHECK: umin.8b
-	%tmp1 = load <8 x i8>* %A
-	%tmp2 = load <8 x i8>* %B
-	%tmp3 = call <8 x i8> @llvm.arm64.neon.umin.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
-	ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @umin_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: umin_16b:
-;CHECK: umin.16b
-	%tmp1 = load <16 x i8>* %A
-	%tmp2 = load <16 x i8>* %B
-	%tmp3 = call <16 x i8> @llvm.arm64.neon.umin.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
-	ret <16 x i8> %tmp3
-}
-
-define <4 x i16> @umin_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: umin_4h:
-;CHECK: umin.4h
-	%tmp1 = load <4 x i16>* %A
-	%tmp2 = load <4 x i16>* %B
-	%tmp3 = call <4 x i16> @llvm.arm64.neon.umin.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
-	ret <4 x i16> %tmp3
-}
-
-define <8 x i16> @umin_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: umin_8h:
-;CHECK: umin.8h
-	%tmp1 = load <8 x i16>* %A
-	%tmp2 = load <8 x i16>* %B
-	%tmp3 = call <8 x i16> @llvm.arm64.neon.umin.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
-	ret <8 x i16> %tmp3
-}
-
-define <2 x i32> @umin_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: umin_2s:
-;CHECK: umin.2s
-	%tmp1 = load <2 x i32>* %A
-	%tmp2 = load <2 x i32>* %B
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.umin.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
-	ret <2 x i32> %tmp3
-}
-
-define <4 x i32> @umin_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: umin_4s:
-;CHECK: umin.4s
-	%tmp1 = load <4 x i32>* %A
-	%tmp2 = load <4 x i32>* %B
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.umin.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
-	ret <4 x i32> %tmp3
-}
-
-declare <8 x i8> @llvm.arm64.neon.umin.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.umin.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.umin.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.umin.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.umin.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.umin.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <8 x i8> @smaxp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: smaxp_8b:
-;CHECK: smaxp.8b
-	%tmp1 = load <8 x i8>* %A
-	%tmp2 = load <8 x i8>* %B
-	%tmp3 = call <8 x i8> @llvm.arm64.neon.smaxp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
-	ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @smaxp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: smaxp_16b:
-;CHECK: smaxp.16b
-	%tmp1 = load <16 x i8>* %A
-	%tmp2 = load <16 x i8>* %B
-	%tmp3 = call <16 x i8> @llvm.arm64.neon.smaxp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
-	ret <16 x i8> %tmp3
-}
-
-define <4 x i16> @smaxp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: smaxp_4h:
-;CHECK: smaxp.4h
-	%tmp1 = load <4 x i16>* %A
-	%tmp2 = load <4 x i16>* %B
-	%tmp3 = call <4 x i16> @llvm.arm64.neon.smaxp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
-	ret <4 x i16> %tmp3
-}
-
-define <8 x i16> @smaxp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: smaxp_8h:
-;CHECK: smaxp.8h
-	%tmp1 = load <8 x i16>* %A
-	%tmp2 = load <8 x i16>* %B
-	%tmp3 = call <8 x i16> @llvm.arm64.neon.smaxp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
-	ret <8 x i16> %tmp3
-}
-
-define <2 x i32> @smaxp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: smaxp_2s:
-;CHECK: smaxp.2s
-	%tmp1 = load <2 x i32>* %A
-	%tmp2 = load <2 x i32>* %B
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.smaxp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
-	ret <2 x i32> %tmp3
-}
-
-define <4 x i32> @smaxp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: smaxp_4s:
-;CHECK: smaxp.4s
-	%tmp1 = load <4 x i32>* %A
-	%tmp2 = load <4 x i32>* %B
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.smaxp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
-	ret <4 x i32> %tmp3
-}
-
-declare <8 x i8> @llvm.arm64.neon.smaxp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.smaxp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.smaxp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.smaxp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.smaxp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.smaxp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <8 x i8> @umaxp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: umaxp_8b:
-;CHECK: umaxp.8b
-	%tmp1 = load <8 x i8>* %A
-	%tmp2 = load <8 x i8>* %B
-	%tmp3 = call <8 x i8> @llvm.arm64.neon.umaxp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
-	ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @umaxp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: umaxp_16b:
-;CHECK: umaxp.16b
-	%tmp1 = load <16 x i8>* %A
-	%tmp2 = load <16 x i8>* %B
-	%tmp3 = call <16 x i8> @llvm.arm64.neon.umaxp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
-	ret <16 x i8> %tmp3
-}
-
-define <4 x i16> @umaxp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: umaxp_4h:
-;CHECK: umaxp.4h
-	%tmp1 = load <4 x i16>* %A
-	%tmp2 = load <4 x i16>* %B
-	%tmp3 = call <4 x i16> @llvm.arm64.neon.umaxp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
-	ret <4 x i16> %tmp3
-}
-
-define <8 x i16> @umaxp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: umaxp_8h:
-;CHECK: umaxp.8h
-	%tmp1 = load <8 x i16>* %A
-	%tmp2 = load <8 x i16>* %B
-	%tmp3 = call <8 x i16> @llvm.arm64.neon.umaxp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
-	ret <8 x i16> %tmp3
-}
-
-define <2 x i32> @umaxp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: umaxp_2s:
-;CHECK: umaxp.2s
-	%tmp1 = load <2 x i32>* %A
-	%tmp2 = load <2 x i32>* %B
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.umaxp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
-	ret <2 x i32> %tmp3
-}
-
-define <4 x i32> @umaxp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: umaxp_4s:
-;CHECK: umaxp.4s
-	%tmp1 = load <4 x i32>* %A
-	%tmp2 = load <4 x i32>* %B
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.umaxp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
-	ret <4 x i32> %tmp3
-}
-
-declare <8 x i8> @llvm.arm64.neon.umaxp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.umaxp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.umaxp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.umaxp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.umaxp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.umaxp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <8 x i8> @sminp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: sminp_8b:
-;CHECK: sminp.8b
-	%tmp1 = load <8 x i8>* %A
-	%tmp2 = load <8 x i8>* %B
-	%tmp3 = call <8 x i8> @llvm.arm64.neon.sminp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
-	ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @sminp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: sminp_16b:
-;CHECK: sminp.16b
-	%tmp1 = load <16 x i8>* %A
-	%tmp2 = load <16 x i8>* %B
-	%tmp3 = call <16 x i8> @llvm.arm64.neon.sminp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
-	ret <16 x i8> %tmp3
-}
-
-define <4 x i16> @sminp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: sminp_4h:
-;CHECK: sminp.4h
-	%tmp1 = load <4 x i16>* %A
-	%tmp2 = load <4 x i16>* %B
-	%tmp3 = call <4 x i16> @llvm.arm64.neon.sminp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
-	ret <4 x i16> %tmp3
-}
-
-define <8 x i16> @sminp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: sminp_8h:
-;CHECK: sminp.8h
-	%tmp1 = load <8 x i16>* %A
-	%tmp2 = load <8 x i16>* %B
-	%tmp3 = call <8 x i16> @llvm.arm64.neon.sminp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
-	ret <8 x i16> %tmp3
-}
-
-define <2 x i32> @sminp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: sminp_2s:
-;CHECK: sminp.2s
-	%tmp1 = load <2 x i32>* %A
-	%tmp2 = load <2 x i32>* %B
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.sminp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
-	ret <2 x i32> %tmp3
-}
-
-define <4 x i32> @sminp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: sminp_4s:
-;CHECK: sminp.4s
-	%tmp1 = load <4 x i32>* %A
-	%tmp2 = load <4 x i32>* %B
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.sminp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
-	ret <4 x i32> %tmp3
-}
-
-declare <8 x i8> @llvm.arm64.neon.sminp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.sminp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sminp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.sminp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sminp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.sminp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <8 x i8> @uminp_8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: uminp_8b:
-;CHECK: uminp.8b
-	%tmp1 = load <8 x i8>* %A
-	%tmp2 = load <8 x i8>* %B
-	%tmp3 = call <8 x i8> @llvm.arm64.neon.uminp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
-	ret <8 x i8> %tmp3
-}
-
-define <16 x i8> @uminp_16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: uminp_16b:
-;CHECK: uminp.16b
-	%tmp1 = load <16 x i8>* %A
-	%tmp2 = load <16 x i8>* %B
-	%tmp3 = call <16 x i8> @llvm.arm64.neon.uminp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
-	ret <16 x i8> %tmp3
-}
-
-define <4 x i16> @uminp_4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: uminp_4h:
-;CHECK: uminp.4h
-	%tmp1 = load <4 x i16>* %A
-	%tmp2 = load <4 x i16>* %B
-	%tmp3 = call <4 x i16> @llvm.arm64.neon.uminp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
-	ret <4 x i16> %tmp3
-}
-
-define <8 x i16> @uminp_8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: uminp_8h:
-;CHECK: uminp.8h
-	%tmp1 = load <8 x i16>* %A
-	%tmp2 = load <8 x i16>* %B
-	%tmp3 = call <8 x i16> @llvm.arm64.neon.uminp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
-	ret <8 x i16> %tmp3
-}
-
-define <2 x i32> @uminp_2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: uminp_2s:
-;CHECK: uminp.2s
-	%tmp1 = load <2 x i32>* %A
-	%tmp2 = load <2 x i32>* %B
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.uminp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
-	ret <2 x i32> %tmp3
-}
-
-define <4 x i32> @uminp_4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: uminp_4s:
-;CHECK: uminp.4s
-	%tmp1 = load <4 x i32>* %A
-	%tmp2 = load <4 x i32>* %B
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.uminp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
-	ret <4 x i32> %tmp3
-}
-
-declare <8 x i8> @llvm.arm64.neon.uminp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <16 x i8> @llvm.arm64.neon.uminp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.uminp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.uminp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.uminp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.uminp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-
-define <2 x float> @fmax_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
-;CHECK-LABEL: fmax_2s:
-;CHECK: fmax.2s
-	%tmp1 = load <2 x float>* %A
-	%tmp2 = load <2 x float>* %B
-	%tmp3 = call <2 x float> @llvm.arm64.neon.fmax.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
-	ret <2 x float> %tmp3
-}
-
-define <4 x float> @fmax_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
-;CHECK-LABEL: fmax_4s:
-;CHECK: fmax.4s
-	%tmp1 = load <4 x float>* %A
-	%tmp2 = load <4 x float>* %B
-	%tmp3 = call <4 x float> @llvm.arm64.neon.fmax.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
-	ret <4 x float> %tmp3
-}
-
-define <2 x double> @fmax_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
-;CHECK-LABEL: fmax_2d:
-;CHECK: fmax.2d
-	%tmp1 = load <2 x double>* %A
-	%tmp2 = load <2 x double>* %B
-	%tmp3 = call <2 x double> @llvm.arm64.neon.fmax.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
-	ret <2 x double> %tmp3
-}
-
-declare <2 x float> @llvm.arm64.neon.fmax.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.fmax.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.fmax.v2f64(<2 x double>, <2 x double>) nounwind readnone
-
-define <2 x float> @fmaxp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
-;CHECK-LABEL: fmaxp_2s:
-;CHECK: fmaxp.2s
-	%tmp1 = load <2 x float>* %A
-	%tmp2 = load <2 x float>* %B
-	%tmp3 = call <2 x float> @llvm.arm64.neon.fmaxp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
-	ret <2 x float> %tmp3
-}
-
-define <4 x float> @fmaxp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
-;CHECK-LABEL: fmaxp_4s:
-;CHECK: fmaxp.4s
-	%tmp1 = load <4 x float>* %A
-	%tmp2 = load <4 x float>* %B
-	%tmp3 = call <4 x float> @llvm.arm64.neon.fmaxp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
-	ret <4 x float> %tmp3
-}
-
-define <2 x double> @fmaxp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
-;CHECK-LABEL: fmaxp_2d:
-;CHECK: fmaxp.2d
-	%tmp1 = load <2 x double>* %A
-	%tmp2 = load <2 x double>* %B
-	%tmp3 = call <2 x double> @llvm.arm64.neon.fmaxp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
-	ret <2 x double> %tmp3
-}
-
-declare <2 x float> @llvm.arm64.neon.fmaxp.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.fmaxp.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.fmaxp.v2f64(<2 x double>, <2 x double>) nounwind readnone
-
-define <2 x float> @fmin_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
-;CHECK-LABEL: fmin_2s:
-;CHECK: fmin.2s
-	%tmp1 = load <2 x float>* %A
-	%tmp2 = load <2 x float>* %B
-	%tmp3 = call <2 x float> @llvm.arm64.neon.fmin.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
-	ret <2 x float> %tmp3
-}
-
-define <4 x float> @fmin_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
-;CHECK-LABEL: fmin_4s:
-;CHECK: fmin.4s
-	%tmp1 = load <4 x float>* %A
-	%tmp2 = load <4 x float>* %B
-	%tmp3 = call <4 x float> @llvm.arm64.neon.fmin.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
-	ret <4 x float> %tmp3
-}
-
-define <2 x double> @fmin_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
-;CHECK-LABEL: fmin_2d:
-;CHECK: fmin.2d
-	%tmp1 = load <2 x double>* %A
-	%tmp2 = load <2 x double>* %B
-	%tmp3 = call <2 x double> @llvm.arm64.neon.fmin.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
-	ret <2 x double> %tmp3
-}
-
-declare <2 x float> @llvm.arm64.neon.fmin.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.fmin.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.fmin.v2f64(<2 x double>, <2 x double>) nounwind readnone
-
-define <2 x float> @fminp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
-;CHECK-LABEL: fminp_2s:
-;CHECK: fminp.2s
-	%tmp1 = load <2 x float>* %A
-	%tmp2 = load <2 x float>* %B
-	%tmp3 = call <2 x float> @llvm.arm64.neon.fminp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
-	ret <2 x float> %tmp3
-}
-
-define <4 x float> @fminp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
-;CHECK-LABEL: fminp_4s:
-;CHECK: fminp.4s
-	%tmp1 = load <4 x float>* %A
-	%tmp2 = load <4 x float>* %B
-	%tmp3 = call <4 x float> @llvm.arm64.neon.fminp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
-	ret <4 x float> %tmp3
-}
-
-define <2 x double> @fminp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
-;CHECK-LABEL: fminp_2d:
-;CHECK: fminp.2d
-	%tmp1 = load <2 x double>* %A
-	%tmp2 = load <2 x double>* %B
-	%tmp3 = call <2 x double> @llvm.arm64.neon.fminp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
-	ret <2 x double> %tmp3
-}
-
-declare <2 x float> @llvm.arm64.neon.fminp.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.fminp.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.fminp.v2f64(<2 x double>, <2 x double>) nounwind readnone
-
-define <2 x float> @fminnmp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
-;CHECK-LABEL: fminnmp_2s:
-;CHECK: fminnmp.2s
-	%tmp1 = load <2 x float>* %A
-	%tmp2 = load <2 x float>* %B
-	%tmp3 = call <2 x float> @llvm.arm64.neon.fminnmp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
-	ret <2 x float> %tmp3
-}
-
-define <4 x float> @fminnmp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
-;CHECK-LABEL: fminnmp_4s:
-;CHECK: fminnmp.4s
-	%tmp1 = load <4 x float>* %A
-	%tmp2 = load <4 x float>* %B
-	%tmp3 = call <4 x float> @llvm.arm64.neon.fminnmp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
-	ret <4 x float> %tmp3
-}
-
-define <2 x double> @fminnmp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
-;CHECK-LABEL: fminnmp_2d:
-;CHECK: fminnmp.2d
-	%tmp1 = load <2 x double>* %A
-	%tmp2 = load <2 x double>* %B
-	%tmp3 = call <2 x double> @llvm.arm64.neon.fminnmp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
-	ret <2 x double> %tmp3
-}
-
-declare <2 x float> @llvm.arm64.neon.fminnmp.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.fminnmp.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.fminnmp.v2f64(<2 x double>, <2 x double>) nounwind readnone
-
-define <2 x float> @fmaxnmp_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
-;CHECK-LABEL: fmaxnmp_2s:
-;CHECK: fmaxnmp.2s
-	%tmp1 = load <2 x float>* %A
-	%tmp2 = load <2 x float>* %B
-	%tmp3 = call <2 x float> @llvm.arm64.neon.fmaxnmp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
-	ret <2 x float> %tmp3
-}
-
-define <4 x float> @fmaxnmp_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
-;CHECK-LABEL: fmaxnmp_4s:
-;CHECK: fmaxnmp.4s
-	%tmp1 = load <4 x float>* %A
-	%tmp2 = load <4 x float>* %B
-	%tmp3 = call <4 x float> @llvm.arm64.neon.fmaxnmp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
-	ret <4 x float> %tmp3
-}
-
-define <2 x double> @fmaxnmp_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
-;CHECK-LABEL: fmaxnmp_2d:
-;CHECK: fmaxnmp.2d
-	%tmp1 = load <2 x double>* %A
-	%tmp2 = load <2 x double>* %B
-	%tmp3 = call <2 x double> @llvm.arm64.neon.fmaxnmp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
-	ret <2 x double> %tmp3
-}
-
-declare <2 x float> @llvm.arm64.neon.fmaxnmp.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.fmaxnmp.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.fmaxnmp.v2f64(<2 x double>, <2 x double>) nounwind readnone
diff --git a/llvm/test/CodeGen/ARM64/vminmaxnm.ll b/llvm/test/CodeGen/ARM64/vminmaxnm.ll
deleted file mode 100644
index 6286407..0000000
--- a/llvm/test/CodeGen/ARM64/vminmaxnm.ll
+++ /dev/null
@@ -1,68 +0,0 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
-
-define <2 x float> @f1(<2 x float> %a, <2 x float> %b) nounwind readnone ssp {
-; CHECK: fmaxnm.2s	v0, v0, v1
-; CHECK: ret
-  %vmaxnm2.i = tail call <2 x float> @llvm.arm64.neon.fmaxnm.v2f32(<2 x float> %a, <2 x float> %b) nounwind
-  ret <2 x float> %vmaxnm2.i
-}
-
-define <4 x float> @f2(<4 x float> %a, <4 x float> %b) nounwind readnone ssp {
-; CHECK: fmaxnm.4s	v0, v0, v1
-; CHECK: ret
-  %vmaxnm2.i = tail call <4 x float> @llvm.arm64.neon.fmaxnm.v4f32(<4 x float> %a, <4 x float> %b) nounwind
-  ret <4 x float> %vmaxnm2.i
-}
-
-define <2 x double> @f3(<2 x double> %a, <2 x double> %b) nounwind readnone ssp {
-; CHECK: fmaxnm.2d	v0, v0, v1
-; CHECK: ret
-  %vmaxnm2.i = tail call <2 x double> @llvm.arm64.neon.fmaxnm.v2f64(<2 x double> %a, <2 x double> %b) nounwind
-  ret <2 x double> %vmaxnm2.i
-}
-
-define <2 x float> @f4(<2 x float> %a, <2 x float> %b) nounwind readnone ssp {
-; CHECK: fminnm.2s	v0, v0, v1
-; CHECK: ret
-  %vminnm2.i = tail call <2 x float> @llvm.arm64.neon.fminnm.v2f32(<2 x float> %a, <2 x float> %b) nounwind
-  ret <2 x float> %vminnm2.i
-}
-
-define <4 x float> @f5(<4 x float> %a, <4 x float> %b) nounwind readnone ssp {
-; CHECK: fminnm.4s	v0, v0, v1
-; CHECK: ret
-  %vminnm2.i = tail call <4 x float> @llvm.arm64.neon.fminnm.v4f32(<4 x float> %a, <4 x float> %b) nounwind
-  ret <4 x float> %vminnm2.i
-}
-
-define <2 x double> @f6(<2 x double> %a, <2 x double> %b) nounwind readnone ssp {
-; CHECK: fminnm.2d	v0, v0, v1
-; CHECK: ret
-  %vminnm2.i = tail call <2 x double> @llvm.arm64.neon.fminnm.v2f64(<2 x double> %a, <2 x double> %b) nounwind
-  ret <2 x double> %vminnm2.i
-}
-
-declare <2 x double> @llvm.arm64.neon.fminnm.v2f64(<2 x double>, <2 x double>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.fminnm.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x float> @llvm.arm64.neon.fminnm.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.fmaxnm.v2f64(<2 x double>, <2 x double>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.fmaxnm.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x float> @llvm.arm64.neon.fmaxnm.v2f32(<2 x float>, <2 x float>) nounwind readnone
-
-define double @test_fmaxnmv(<2 x double> %in) {
-; CHECK-LABEL: test_fmaxnmv:
-; CHECK: fmaxnmp.2d d0, v0
-  %max = call double @llvm.arm64.neon.fmaxnmv.f64.v2f64(<2 x double> %in)
-  ret double %max
-}
-
-define double @test_fminnmv(<2 x double> %in) {
-; CHECK-LABEL: test_fminnmv:
-; CHECK: fminnmp.2d d0, v0
-  %min = call double @llvm.arm64.neon.fminnmv.f64.v2f64(<2 x double> %in)
-  ret double %min
-}
-
-declare double @llvm.arm64.neon.fmaxnmv.f64.v2f64(<2 x double>)
-declare double @llvm.arm64.neon.fminnmv.f64.v2f64(<2 x double>)
diff --git a/llvm/test/CodeGen/ARM64/vqadd.ll b/llvm/test/CodeGen/ARM64/vqadd.ll
deleted file mode 100644
index 0b7f7e5..0000000
--- a/llvm/test/CodeGen/ARM64/vqadd.ll
+++ /dev/null
@@ -1,332 +0,0 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
-
-define <8 x i8> @sqadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: sqadd8b:
-;CHECK: sqadd.8b
-	%tmp1 = load <8 x i8>* %A
-	%tmp2 = load <8 x i8>* %B
-	%tmp3 = call <8 x i8> @llvm.arm64.neon.sqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
-	ret <8 x i8> %tmp3
-}
-
-define <4 x i16> @sqadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: sqadd4h:
-;CHECK: sqadd.4h
-	%tmp1 = load <4 x i16>* %A
-	%tmp2 = load <4 x i16>* %B
-	%tmp3 = call <4 x i16> @llvm.arm64.neon.sqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
-	ret <4 x i16> %tmp3
-}
-
-define <2 x i32> @sqadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: sqadd2s:
-;CHECK: sqadd.2s
-	%tmp1 = load <2 x i32>* %A
-	%tmp2 = load <2 x i32>* %B
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.sqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
-	ret <2 x i32> %tmp3
-}
-
-define <8 x i8> @uqadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: uqadd8b:
-;CHECK: uqadd.8b
-	%tmp1 = load <8 x i8>* %A
-	%tmp2 = load <8 x i8>* %B
-	%tmp3 = call <8 x i8> @llvm.arm64.neon.uqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
-	ret <8 x i8> %tmp3
-}
-
-define <4 x i16> @uqadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: uqadd4h:
-;CHECK: uqadd.4h
-	%tmp1 = load <4 x i16>* %A
-	%tmp2 = load <4 x i16>* %B
-	%tmp3 = call <4 x i16> @llvm.arm64.neon.uqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
-	ret <4 x i16> %tmp3
-}
-
-define <2 x i32> @uqadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: uqadd2s:
-;CHECK: uqadd.2s
-	%tmp1 = load <2 x i32>* %A
-	%tmp2 = load <2 x i32>* %B
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.uqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
-	ret <2 x i32> %tmp3
-}
-
-define <16 x i8> @sqadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: sqadd16b:
-;CHECK: sqadd.16b
-	%tmp1 = load <16 x i8>* %A
-	%tmp2 = load <16 x i8>* %B
-	%tmp3 = call <16 x i8> @llvm.arm64.neon.sqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
-	ret <16 x i8> %tmp3
-}
-
-define <8 x i16> @sqadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: sqadd8h:
-;CHECK: sqadd.8h
-	%tmp1 = load <8 x i16>* %A
-	%tmp2 = load <8 x i16>* %B
-	%tmp3 = call <8 x i16> @llvm.arm64.neon.sqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
-	ret <8 x i16> %tmp3
-}
-
-define <4 x i32> @sqadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: sqadd4s:
-;CHECK: sqadd.4s
-	%tmp1 = load <4 x i32>* %A
-	%tmp2 = load <4 x i32>* %B
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
-	ret <4 x i32> %tmp3
-}
-
-define <2 x i64> @sqadd2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
-;CHECK-LABEL: sqadd2d:
-;CHECK: sqadd.2d
-	%tmp1 = load <2 x i64>* %A
-	%tmp2 = load <2 x i64>* %B
-	%tmp3 = call <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
-	ret <2 x i64> %tmp3
-}
-
-define <16 x i8> @uqadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: uqadd16b:
-;CHECK: uqadd.16b
-	%tmp1 = load <16 x i8>* %A
-	%tmp2 = load <16 x i8>* %B
-	%tmp3 = call <16 x i8> @llvm.arm64.neon.uqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
-	ret <16 x i8> %tmp3
-}
-
-define <8 x i16> @uqadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: uqadd8h:
-;CHECK: uqadd.8h
-	%tmp1 = load <8 x i16>* %A
-	%tmp2 = load <8 x i16>* %B
-	%tmp3 = call <8 x i16> @llvm.arm64.neon.uqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
-	ret <8 x i16> %tmp3
-}
-
-define <4 x i32> @uqadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: uqadd4s:
-;CHECK: uqadd.4s
-	%tmp1 = load <4 x i32>* %A
-	%tmp2 = load <4 x i32>* %B
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.uqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
-	ret <4 x i32> %tmp3
-}
-
-define <2 x i64> @uqadd2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
-;CHECK-LABEL: uqadd2d:
-;CHECK: uqadd.2d
-	%tmp1 = load <2 x i64>* %A
-	%tmp2 = load <2 x i64>* %B
-	%tmp3 = call <2 x i64> @llvm.arm64.neon.uqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
-	ret <2 x i64> %tmp3
-}
-
-declare <8 x i8>  @llvm.arm64.neon.sqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.sqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
-
-declare <8 x i8>  @llvm.arm64.neon.uqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.uqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.uqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.uqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
-
-declare <16 x i8> @llvm.arm64.neon.sqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.sqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
-
-declare <16 x i8> @llvm.arm64.neon.uqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.uqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.uqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.uqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <8 x i8> @usqadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: usqadd8b:
-;CHECK: usqadd.8b
-	%tmp1 = load <8 x i8>* %A
-	%tmp2 = load <8 x i8>* %B
-	%tmp3 = call <8 x i8> @llvm.arm64.neon.usqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
-	ret <8 x i8> %tmp3
-}
-
-define <4 x i16> @usqadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: usqadd4h:
-;CHECK: usqadd.4h
-	%tmp1 = load <4 x i16>* %A
-	%tmp2 = load <4 x i16>* %B
-	%tmp3 = call <4 x i16> @llvm.arm64.neon.usqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
-	ret <4 x i16> %tmp3
-}
-
-define <2 x i32> @usqadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: usqadd2s:
-;CHECK: usqadd.2s
-	%tmp1 = load <2 x i32>* %A
-	%tmp2 = load <2 x i32>* %B
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.usqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
-	ret <2 x i32> %tmp3
-}
-
-define <16 x i8> @usqadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: usqadd16b:
-;CHECK: usqadd.16b
-	%tmp1 = load <16 x i8>* %A
-	%tmp2 = load <16 x i8>* %B
-	%tmp3 = call <16 x i8> @llvm.arm64.neon.usqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
-	ret <16 x i8> %tmp3
-}
-
-define <8 x i16> @usqadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: usqadd8h:
-;CHECK: usqadd.8h
-	%tmp1 = load <8 x i16>* %A
-	%tmp2 = load <8 x i16>* %B
-	%tmp3 = call <8 x i16> @llvm.arm64.neon.usqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
-	ret <8 x i16> %tmp3
-}
-
-define <4 x i32> @usqadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: usqadd4s:
-;CHECK: usqadd.4s
-	%tmp1 = load <4 x i32>* %A
-	%tmp2 = load <4 x i32>* %B
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.usqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
-	ret <4 x i32> %tmp3
-}
-
-define <2 x i64> @usqadd2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
-;CHECK-LABEL: usqadd2d:
-;CHECK: usqadd.2d
-	%tmp1 = load <2 x i64>* %A
-	%tmp2 = load <2 x i64>* %B
-	%tmp3 = call <2 x i64> @llvm.arm64.neon.usqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
-	ret <2 x i64> %tmp3
-}
-
-define i64 @usqadd_d(i64 %l, i64 %r) nounwind {
-; CHECK-LABEL: usqadd_d:
-; CHECK: usqadd {{d[0-9]+}}, {{d[0-9]+}}
-  %sum = call i64 @llvm.arm64.neon.usqadd.i64(i64 %l, i64 %r)
-  ret i64 %sum
-}
-
-define i32 @usqadd_s(i32 %l, i32 %r) nounwind {
-; CHECK-LABEL: usqadd_s:
-; CHECK: usqadd {{s[0-9]+}}, {{s[0-9]+}}
-  %sum = call i32 @llvm.arm64.neon.usqadd.i32(i32 %l, i32 %r)
-  ret i32 %sum
-}
-
-declare <8 x i8>  @llvm.arm64.neon.usqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.usqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.usqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.usqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
-declare i64 @llvm.arm64.neon.usqadd.i64(i64, i64) nounwind readnone
-declare i32 @llvm.arm64.neon.usqadd.i32(i32, i32) nounwind readnone
-
-declare <16 x i8> @llvm.arm64.neon.usqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.usqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.usqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.usqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
-
-define <8 x i8> @suqadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: suqadd8b:
-;CHECK: suqadd.8b
-	%tmp1 = load <8 x i8>* %A
-	%tmp2 = load <8 x i8>* %B
-	%tmp3 = call <8 x i8> @llvm.arm64.neon.suqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
-	ret <8 x i8> %tmp3
-}
-
-define <4 x i16> @suqadd4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: suqadd4h:
-;CHECK: suqadd.4h
-	%tmp1 = load <4 x i16>* %A
-	%tmp2 = load <4 x i16>* %B
-	%tmp3 = call <4 x i16> @llvm.arm64.neon.suqadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
-	ret <4 x i16> %tmp3
-}
-
-define <2 x i32> @suqadd2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: suqadd2s:
-;CHECK: suqadd.2s
-	%tmp1 = load <2 x i32>* %A
-	%tmp2 = load <2 x i32>* %B
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.suqadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
-	ret <2 x i32> %tmp3
-}
-
-define <16 x i8> @suqadd16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: suqadd16b:
-;CHECK: suqadd.16b
-	%tmp1 = load <16 x i8>* %A
-	%tmp2 = load <16 x i8>* %B
-	%tmp3 = call <16 x i8> @llvm.arm64.neon.suqadd.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
-	ret <16 x i8> %tmp3
-}
-
-define <8 x i16> @suqadd8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: suqadd8h:
-;CHECK: suqadd.8h
-	%tmp1 = load <8 x i16>* %A
-	%tmp2 = load <8 x i16>* %B
-	%tmp3 = call <8 x i16> @llvm.arm64.neon.suqadd.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
-	ret <8 x i16> %tmp3
-}
-
-define <4 x i32> @suqadd4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: suqadd4s:
-;CHECK: suqadd.4s
-	%tmp1 = load <4 x i32>* %A
-	%tmp2 = load <4 x i32>* %B
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.suqadd.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
-	ret <4 x i32> %tmp3
-}
-
-define <2 x i64> @suqadd2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
-;CHECK-LABEL: suqadd2d:
-;CHECK: suqadd.2d
-	%tmp1 = load <2 x i64>* %A
-	%tmp2 = load <2 x i64>* %B
-	%tmp3 = call <2 x i64> @llvm.arm64.neon.suqadd.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
-	ret <2 x i64> %tmp3
-}
-
-define <1 x i64> @suqadd_1d(<1 x i64> %l, <1 x i64> %r) nounwind {
-; CHECK-LABEL: suqadd_1d:
-; CHECK: suqadd {{d[0-9]+}}, {{d[0-9]+}}
-  %sum = call <1 x i64> @llvm.arm64.neon.suqadd.v1i64(<1 x i64> %l, <1 x i64> %r)
-  ret <1 x i64> %sum
-}
-
-define i64 @suqadd_d(i64 %l, i64 %r) nounwind {
-; CHECK-LABEL: suqadd_d:
-; CHECK: suqadd {{d[0-9]+}}, {{d[0-9]+}}
-  %sum = call i64 @llvm.arm64.neon.suqadd.i64(i64 %l, i64 %r)
-  ret i64 %sum
-}
-
-define i32 @suqadd_s(i32 %l, i32 %r) nounwind {
-; CHECK-LABEL: suqadd_s:
-; CHECK: suqadd {{s[0-9]+}}, {{s[0-9]+}}
-  %sum = call i32 @llvm.arm64.neon.suqadd.i32(i32 %l, i32 %r)
-  ret i32 %sum
-}
-
-declare <8 x i8>  @llvm.arm64.neon.suqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.suqadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.suqadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.suqadd.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
-declare i64 @llvm.arm64.neon.suqadd.i64(i64, i64) nounwind readnone
-declare i32 @llvm.arm64.neon.suqadd.i32(i32, i32) nounwind readnone
-
-declare <16 x i8> @llvm.arm64.neon.suqadd.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.suqadd.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.suqadd.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.suqadd.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
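
Note: the vqadd.ll coverage deleted above carries on under the AArch64 tree with the intrinsics renamed from llvm.arm64.neon.* to llvm.aarch64.neon.*. A minimal sketch of what the first test reads like after the rename, assuming the post-rename -aarch64-neon-syntax option spelling:

; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s

define <8 x i8> @sqadd8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: sqadd8b:
;CHECK: sqadd.8b
	%tmp1 = load <8 x i8>* %A
	%tmp2 = load <8 x i8>* %B
	; sketch: same call as before, intrinsic renamed from llvm.arm64.neon.sqadd.v8i8
	%tmp3 = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
	ret <8 x i8> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone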
diff --git a/llvm/test/CodeGen/ARM64/vqsub.ll b/llvm/test/CodeGen/ARM64/vqsub.ll
deleted file mode 100644
index 0afeb68..0000000
--- a/llvm/test/CodeGen/ARM64/vqsub.ll
+++ /dev/null
@@ -1,147 +0,0 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
-
-define <8 x i8> @sqsub8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: sqsub8b:
-;CHECK: sqsub.8b
-	%tmp1 = load <8 x i8>* %A
-	%tmp2 = load <8 x i8>* %B
-	%tmp3 = call <8 x i8> @llvm.arm64.neon.sqsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
-	ret <8 x i8> %tmp3
-}
-
-define <4 x i16> @sqsub4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: sqsub4h:
-;CHECK: sqsub.4h
-	%tmp1 = load <4 x i16>* %A
-	%tmp2 = load <4 x i16>* %B
-	%tmp3 = call <4 x i16> @llvm.arm64.neon.sqsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
-	ret <4 x i16> %tmp3
-}
-
-define <2 x i32> @sqsub2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: sqsub2s:
-;CHECK: sqsub.2s
-	%tmp1 = load <2 x i32>* %A
-	%tmp2 = load <2 x i32>* %B
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.sqsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
-	ret <2 x i32> %tmp3
-}
-
-define <8 x i8> @uqsub8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK-LABEL: uqsub8b:
-;CHECK: uqsub.8b
-	%tmp1 = load <8 x i8>* %A
-	%tmp2 = load <8 x i8>* %B
-	%tmp3 = call <8 x i8> @llvm.arm64.neon.uqsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
-	ret <8 x i8> %tmp3
-}
-
-define <4 x i16> @uqsub4h(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK-LABEL: uqsub4h:
-;CHECK: uqsub.4h
-	%tmp1 = load <4 x i16>* %A
-	%tmp2 = load <4 x i16>* %B
-	%tmp3 = call <4 x i16> @llvm.arm64.neon.uqsub.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
-	ret <4 x i16> %tmp3
-}
-
-define <2 x i32> @uqsub2s(<2 x i32>* %A, <2 x i32>* %B) nounwind {
-;CHECK-LABEL: uqsub2s:
-;CHECK: uqsub.2s
-	%tmp1 = load <2 x i32>* %A
-	%tmp2 = load <2 x i32>* %B
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.uqsub.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
-	ret <2 x i32> %tmp3
-}
-
-define <16 x i8> @sqsub16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: sqsub16b:
-;CHECK: sqsub.16b
-	%tmp1 = load <16 x i8>* %A
-	%tmp2 = load <16 x i8>* %B
-	%tmp3 = call <16 x i8> @llvm.arm64.neon.sqsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
-	ret <16 x i8> %tmp3
-}
-
-define <8 x i16> @sqsub8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: sqsub8h:
-;CHECK: sqsub.8h
-	%tmp1 = load <8 x i16>* %A
-	%tmp2 = load <8 x i16>* %B
-	%tmp3 = call <8 x i16> @llvm.arm64.neon.sqsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
-	ret <8 x i16> %tmp3
-}
-
-define <4 x i32> @sqsub4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: sqsub4s:
-;CHECK: sqsub.4s
-	%tmp1 = load <4 x i32>* %A
-	%tmp2 = load <4 x i32>* %B
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
-	ret <4 x i32> %tmp3
-}
-
-define <2 x i64> @sqsub2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
-;CHECK-LABEL: sqsub2d:
-;CHECK: sqsub.2d
-	%tmp1 = load <2 x i64>* %A
-	%tmp2 = load <2 x i64>* %B
-	%tmp3 = call <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
-	ret <2 x i64> %tmp3
-}
-
-define <16 x i8> @uqsub16b(<16 x i8>* %A, <16 x i8>* %B) nounwind {
-;CHECK-LABEL: uqsub16b:
-;CHECK: uqsub.16b
-	%tmp1 = load <16 x i8>* %A
-	%tmp2 = load <16 x i8>* %B
-	%tmp3 = call <16 x i8> @llvm.arm64.neon.uqsub.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2)
-	ret <16 x i8> %tmp3
-}
-
-define <8 x i16> @uqsub8h(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: uqsub8h:
-;CHECK: uqsub.8h
-	%tmp1 = load <8 x i16>* %A
-	%tmp2 = load <8 x i16>* %B
-	%tmp3 = call <8 x i16> @llvm.arm64.neon.uqsub.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2)
-	ret <8 x i16> %tmp3
-}
-
-define <4 x i32> @uqsub4s(<4 x i32>* %A, <4 x i32>* %B) nounwind {
-;CHECK-LABEL: uqsub4s:
-;CHECK: uqsub.4s
-	%tmp1 = load <4 x i32>* %A
-	%tmp2 = load <4 x i32>* %B
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.uqsub.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2)
-	ret <4 x i32> %tmp3
-}
-
-define <2 x i64> @uqsub2d(<2 x i64>* %A, <2 x i64>* %B) nounwind {
-;CHECK-LABEL: uqsub2d:
-;CHECK: uqsub.2d
-	%tmp1 = load <2 x i64>* %A
-	%tmp2 = load <2 x i64>* %B
-	%tmp3 = call <2 x i64> @llvm.arm64.neon.uqsub.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2)
-	ret <2 x i64> %tmp3
-}
-
-declare <8 x i8>  @llvm.arm64.neon.sqsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.sqsub.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
-
-declare <8 x i8>  @llvm.arm64.neon.uqsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
-declare <4 x i16> @llvm.arm64.neon.uqsub.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
-declare <2 x i32> @llvm.arm64.neon.uqsub.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
-declare <1 x i64> @llvm.arm64.neon.uqsub.v1i64(<1 x i64>, <1 x i64>) nounwind readnone
-
-declare <16 x i8> @llvm.arm64.neon.sqsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.sqsub.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
-
-declare <16 x i8> @llvm.arm64.neon.uqsub.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
-declare <8 x i16> @llvm.arm64.neon.uqsub.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.uqsub.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
-declare <2 x i64> @llvm.arm64.neon.uqsub.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
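
Likewise for vqsub.ll: the same tests survive with the llvm.aarch64.neon.sqsub/uqsub spellings. A sketch of the renamed uqsub8b, under the same assumptions as above:

; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s

define <8 x i8> @uqsub8b(<8 x i8>* %A, <8 x i8>* %B) nounwind {
;CHECK-LABEL: uqsub8b:
;CHECK: uqsub.8b
	%tmp1 = load <8 x i8>* %A
	%tmp2 = load <8 x i8>* %B
	; sketch: intrinsic renamed from llvm.arm64.neon.uqsub.v8i8
	%tmp3 = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
	ret <8 x i8> %tmp3
}

declare <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8>, <8 x i8>) nounwind readnone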
diff --git a/llvm/test/CodeGen/ARM64/vsqrt.ll b/llvm/test/CodeGen/ARM64/vsqrt.ll
deleted file mode 100644
index 094d704..0000000
--- a/llvm/test/CodeGen/ARM64/vsqrt.ll
+++ /dev/null
@@ -1,232 +0,0 @@
-; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s
-
-define <2 x float> @frecps_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
-;CHECK-LABEL: frecps_2s:
-;CHECK: frecps.2s
-	%tmp1 = load <2 x float>* %A
-	%tmp2 = load <2 x float>* %B
-	%tmp3 = call <2 x float> @llvm.arm64.neon.frecps.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
-	ret <2 x float> %tmp3
-}
-
-define <4 x float> @frecps_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
-;CHECK-LABEL: frecps_4s:
-;CHECK: frecps.4s
-	%tmp1 = load <4 x float>* %A
-	%tmp2 = load <4 x float>* %B
-	%tmp3 = call <4 x float> @llvm.arm64.neon.frecps.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
-	ret <4 x float> %tmp3
-}
-
-define <2 x double> @frecps_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
-;CHECK-LABEL: frecps_2d:
-;CHECK: frecps.2d
-	%tmp1 = load <2 x double>* %A
-	%tmp2 = load <2 x double>* %B
-	%tmp3 = call <2 x double> @llvm.arm64.neon.frecps.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
-	ret <2 x double> %tmp3
-}
-
-declare <2 x float> @llvm.arm64.neon.frecps.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.frecps.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.frecps.v2f64(<2 x double>, <2 x double>) nounwind readnone
-
-
-define <2 x float> @frsqrts_2s(<2 x float>* %A, <2 x float>* %B) nounwind {
-;CHECK-LABEL: frsqrts_2s:
-;CHECK: frsqrts.2s
-	%tmp1 = load <2 x float>* %A
-	%tmp2 = load <2 x float>* %B
-	%tmp3 = call <2 x float> @llvm.arm64.neon.frsqrts.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
-	ret <2 x float> %tmp3
-}
-
-define <4 x float> @frsqrts_4s(<4 x float>* %A, <4 x float>* %B) nounwind {
-;CHECK-LABEL: frsqrts_4s:
-;CHECK: frsqrts.4s
-	%tmp1 = load <4 x float>* %A
-	%tmp2 = load <4 x float>* %B
-	%tmp3 = call <4 x float> @llvm.arm64.neon.frsqrts.v4f32(<4 x float> %tmp1, <4 x float> %tmp2)
-	ret <4 x float> %tmp3
-}
-
-define <2 x double> @frsqrts_2d(<2 x double>* %A, <2 x double>* %B) nounwind {
-;CHECK-LABEL: frsqrts_2d:
-;CHECK: frsqrts.2d
-	%tmp1 = load <2 x double>* %A
-	%tmp2 = load <2 x double>* %B
-	%tmp3 = call <2 x double> @llvm.arm64.neon.frsqrts.v2f64(<2 x double> %tmp1, <2 x double> %tmp2)
-	ret <2 x double> %tmp3
-}
-
-declare <2 x float> @llvm.arm64.neon.frsqrts.v2f32(<2 x float>, <2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.frsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.frsqrts.v2f64(<2 x double>, <2 x double>) nounwind readnone
-
-define <2 x float> @frecpe_2s(<2 x float>* %A) nounwind {
-;CHECK-LABEL: frecpe_2s:
-;CHECK: frecpe.2s
-	%tmp1 = load <2 x float>* %A
-	%tmp3 = call <2 x float> @llvm.arm64.neon.frecpe.v2f32(<2 x float> %tmp1)
-	ret <2 x float> %tmp3
-}
-
-define <4 x float> @frecpe_4s(<4 x float>* %A) nounwind {
-;CHECK-LABEL: frecpe_4s:
-;CHECK: frecpe.4s
-	%tmp1 = load <4 x float>* %A
-	%tmp3 = call <4 x float> @llvm.arm64.neon.frecpe.v4f32(<4 x float> %tmp1)
-	ret <4 x float> %tmp3
-}
-
-define <2 x double> @frecpe_2d(<2 x double>* %A) nounwind {
-;CHECK-LABEL: frecpe_2d:
-;CHECK: frecpe.2d
-	%tmp1 = load <2 x double>* %A
-	%tmp3 = call <2 x double> @llvm.arm64.neon.frecpe.v2f64(<2 x double> %tmp1)
-	ret <2 x double> %tmp3
-}
-
-define float @frecpe_s(float* %A) nounwind {
-;CHECK-LABEL: frecpe_s:
-;CHECK: frecpe s0, {{s[0-9]+}}
-  %tmp1 = load float* %A
-  %tmp3 = call float @llvm.arm64.neon.frecpe.f32(float %tmp1)
-  ret float %tmp3
-}
-
-define double @frecpe_d(double* %A) nounwind {
-;CHECK-LABEL: frecpe_d:
-;CHECK: frecpe d0, {{d[0-9]+}}
-  %tmp1 = load double* %A
-  %tmp3 = call double @llvm.arm64.neon.frecpe.f64(double %tmp1)
-  ret double %tmp3
-}
-
-declare <2 x float> @llvm.arm64.neon.frecpe.v2f32(<2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.frecpe.v4f32(<4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.frecpe.v2f64(<2 x double>) nounwind readnone
-declare float @llvm.arm64.neon.frecpe.f32(float) nounwind readnone
-declare double @llvm.arm64.neon.frecpe.f64(double) nounwind readnone
-
-define float @frecpx_s(float* %A) nounwind {
-;CHECK-LABEL: frecpx_s:
-;CHECK: frecpx s0, {{s[0-9]+}}
-  %tmp1 = load float* %A
-  %tmp3 = call float @llvm.arm64.neon.frecpx.f32(float %tmp1)
-  ret float %tmp3
-}
-
-define double @frecpx_d(double* %A) nounwind {
-;CHECK-LABEL: frecpx_d:
-;CHECK: frecpx d0, {{d[0-9]+}}
-  %tmp1 = load double* %A
-  %tmp3 = call double @llvm.arm64.neon.frecpx.f64(double %tmp1)
-  ret double %tmp3
-}
-
-declare float @llvm.arm64.neon.frecpx.f32(float) nounwind readnone
-declare double @llvm.arm64.neon.frecpx.f64(double) nounwind readnone
-
-define <2 x float> @frsqrte_2s(<2 x float>* %A) nounwind {
-;CHECK-LABEL: frsqrte_2s:
-;CHECK: frsqrte.2s
-	%tmp1 = load <2 x float>* %A
-	%tmp3 = call <2 x float> @llvm.arm64.neon.frsqrte.v2f32(<2 x float> %tmp1)
-	ret <2 x float> %tmp3
-}
-
-define <4 x float> @frsqrte_4s(<4 x float>* %A) nounwind {
-;CHECK-LABEL: frsqrte_4s:
-;CHECK: frsqrte.4s
-	%tmp1 = load <4 x float>* %A
-	%tmp3 = call <4 x float> @llvm.arm64.neon.frsqrte.v4f32(<4 x float> %tmp1)
-	ret <4 x float> %tmp3
-}
-
-define <2 x double> @frsqrte_2d(<2 x double>* %A) nounwind {
-;CHECK-LABEL: frsqrte_2d:
-;CHECK: frsqrte.2d
-	%tmp1 = load <2 x double>* %A
-	%tmp3 = call <2 x double> @llvm.arm64.neon.frsqrte.v2f64(<2 x double> %tmp1)
-	ret <2 x double> %tmp3
-}
-
-define float @frsqrte_s(float* %A) nounwind {
-;CHECK-LABEL: frsqrte_s:
-;CHECK: frsqrte s0, {{s[0-9]+}}
-  %tmp1 = load float* %A
-  %tmp3 = call float @llvm.arm64.neon.frsqrte.f32(float %tmp1)
-  ret float %tmp3
-}
-
-define double @frsqrte_d(double* %A) nounwind {
-;CHECK-LABEL: frsqrte_d:
-;CHECK: frsqrte d0, {{d[0-9]+}}
-  %tmp1 = load double* %A
-  %tmp3 = call double @llvm.arm64.neon.frsqrte.f64(double %tmp1)
-  ret double %tmp3
-}
-
-declare <2 x float> @llvm.arm64.neon.frsqrte.v2f32(<2 x float>) nounwind readnone
-declare <4 x float> @llvm.arm64.neon.frsqrte.v4f32(<4 x float>) nounwind readnone
-declare <2 x double> @llvm.arm64.neon.frsqrte.v2f64(<2 x double>) nounwind readnone
-declare float @llvm.arm64.neon.frsqrte.f32(float) nounwind readnone
-declare double @llvm.arm64.neon.frsqrte.f64(double) nounwind readnone
-
-define <2 x i32> @urecpe_2s(<2 x i32>* %A) nounwind {
-;CHECK-LABEL: urecpe_2s:
-;CHECK: urecpe.2s
-	%tmp1 = load <2 x i32>* %A
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.urecpe.v2i32(<2 x i32> %tmp1)
-	ret <2 x i32> %tmp3
-}
-
-define <4 x i32> @urecpe_4s(<4 x i32>* %A) nounwind {
-;CHECK-LABEL: urecpe_4s:
-;CHECK: urecpe.4s
-	%tmp1 = load <4 x i32>* %A
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.urecpe.v4i32(<4 x i32> %tmp1)
-	ret <4 x i32> %tmp3
-}
-
-declare <2 x i32> @llvm.arm64.neon.urecpe.v2i32(<2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.urecpe.v4i32(<4 x i32>) nounwind readnone
-
-define <2 x i32> @ursqrte_2s(<2 x i32>* %A) nounwind {
-;CHECK-LABEL: ursqrte_2s:
-;CHECK: ursqrte.2s
-	%tmp1 = load <2 x i32>* %A
-	%tmp3 = call <2 x i32> @llvm.arm64.neon.ursqrte.v2i32(<2 x i32> %tmp1)
-	ret <2 x i32> %tmp3
-}
-
-define <4 x i32> @ursqrte_4s(<4 x i32>* %A) nounwind {
-;CHECK-LABEL: ursqrte_4s:
-;CHECK: ursqrte.4s
-	%tmp1 = load <4 x i32>* %A
-	%tmp3 = call <4 x i32> @llvm.arm64.neon.ursqrte.v4i32(<4 x i32> %tmp1)
-	ret <4 x i32> %tmp3
-}
-
-declare <2 x i32> @llvm.arm64.neon.ursqrte.v2i32(<2 x i32>) nounwind readnone
-declare <4 x i32> @llvm.arm64.neon.ursqrte.v4i32(<4 x i32>) nounwind readnone
-
-define float @f1(float %a, float %b) nounwind readnone optsize ssp {
-; CHECK-LABEL: f1:
-; CHECK: frsqrts s0, s0, s1
-; CHECK-NEXT: ret
-  %vrsqrtss.i = tail call float @llvm.arm64.neon.frsqrts.f32(float %a, float %b) nounwind
-  ret float %vrsqrtss.i
-}
-
-define double @f2(double %a, double %b) nounwind readnone optsize ssp {
-; CHECK-LABEL: f2:
-; CHECK: frsqrts d0, d0, d1
-; CHECK-NEXT: ret
-  %vrsqrtsd.i = tail call double @llvm.arm64.neon.frsqrts.f64(double %a, double %b) nounwind
-  ret double %vrsqrtsd.i
-}
-
-declare double @llvm.arm64.neon.frsqrts.f64(double, double) nounwind readnone
-declare float @llvm.arm64.neon.frsqrts.f32(float, float) nounwind readnone
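
And for vsqrt.ll: the reciprocal/rsqrt estimate and step tests keep their shape, with only the intrinsic prefix changing. A sketch of the renamed scalar frsqrts test, same assumptions:

; RUN: llc < %s -march=arm64 -aarch64-neon-syntax=apple | FileCheck %s

define float @f1(float %a, float %b) nounwind readnone optsize ssp {
; CHECK-LABEL: f1:
; CHECK: frsqrts s0, s0, s1
; CHECK-NEXT: ret
  ; sketch: intrinsic renamed from llvm.arm64.neon.frsqrts.f32
  %vrsqrtss.i = tail call float @llvm.aarch64.neon.frsqrts.f32(float %a, float %b) nounwind
  ret float %vrsqrtss.i
}

declare float @llvm.aarch64.neon.frsqrts.f32(float, float) nounwind readnone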