AArch64: add initial NEON support

Patch by Ana Pazos.

- Completed implementation of instruction formats:
AdvSIMD three same
AdvSIMD modified immediate
AdvSIMD scalar pairwise

- Completed implementation of instruction classes
(some instructions in these classes belong to
instruction formats that are not yet finished):
Vector Arithmetic
Vector Immediate
Vector Pairwise Arithmetic

- Initial implementation of instruction formats:
AdvSIMD scalar two-reg misc
AdvSIMD scalar three same

- Initial implementation of instruction class:
Scalar Arithmetic (a sketch follows)
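
A sketch of the scalar (D-register) arithmetic now selected for
<1 x i64> operations, in the style of the tests added below
(the function name is illustrative; see neon-add-sub.ll):

  define <1 x i64> @scalar_add_sketch(<1 x i64> %A, <1 x i64> %B) {
  ; CHECK: add {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
    %sum = add <1 x i64> %A, %B
    ret <1 x i64> %sum
  }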

- Initial clang changes to support ARMv8 intrinsics
(see the IR-level sketch below).
Note: no clang changes yet for scalar intrinsic function name mangling.
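
At the IR level these intrinsics appear as calls to target intrinsics,
which the new patterns select directly. A minimal sketch (the function
name is illustrative; the committed tests in neon-aba-abd.ll below
cover the full set):

  declare <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8>, <8 x i8>)

  define <8 x i8> @uabd_sketch(<8 x i8> %lhs, <8 x i8> %rhs) {
  ; expected selection: uabd v0.8b, v0.8b, v1.8b
    %abd = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
    ret <8 x i8> %abd
  }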

- Comprehensive test cases for the added instructions,
verifying codegen, encoding, decoding, diagnostics, and intrinsics;
a minimal example of the test shape follows.
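
Each test follows the same shape: a RUN line drives llc with NEON
enabled, and FileCheck matches the selected instruction. A minimal
example in that style (the function name is illustrative; see
neon-add-sub.ll below):

  ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s

  define <8 x i8> @vector_add_sketch(<8 x i8> %A, <8 x i8> %B) {
  ; CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
    %sum = add <8 x i8> %A, %B
    ret <8 x i8> %sum
  }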

llvm-svn: 187567
diff --git a/llvm/test/CodeGen/AArch64/complex-copy-noneon.ll b/llvm/test/CodeGen/AArch64/complex-copy-noneon.ll
new file mode 100644
index 0000000..4ae5478
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-copy-noneon.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-neon < %s
+
+; The DAG combiner decided to use a vector load/store for this struct copy
+; previously. This probably shouldn't happen without NEON, but the most
+; important thing is that it compiles.
+
+define void @store_combine() nounwind {
+  %src = alloca { double, double }, align 8
+  %dst = alloca { double, double }, align 8
+
+  %src.realp = getelementptr inbounds { double, double }* %src, i32 0, i32 0
+  %src.real = load double* %src.realp
+  %src.imagp = getelementptr inbounds { double, double }* %src, i32 0, i32 1
+  %src.imag = load double* %src.imagp
+
+  %dst.realp = getelementptr inbounds { double, double }* %dst, i32 0, i32 0
+  %dst.imagp = getelementptr inbounds { double, double }* %dst, i32 0, i32 1
+  store double %src.real, double* %dst.realp
+  store double %src.imag, double* %dst.imagp
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/inline-asm-constraints.ll b/llvm/test/CodeGen/AArch64/inline-asm-constraints.ll
index cfa06a4..18a3b37 100644
--- a/llvm/test/CodeGen/AArch64/inline-asm-constraints.ll
+++ b/llvm/test/CodeGen/AArch64/inline-asm-constraints.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
 define i64 @test_inline_constraint_r(i64 %base, i32 %offset) {
 ; CHECK-LABEL: test_inline_constraint_r:
@@ -44,6 +44,26 @@
 
 @dump = global fp128 zeroinitializer
 
+define void @test_inline_constraint_w(<8 x i8> %vec64, <4 x float> %vec128, half %hlf, float %flt, double %dbl, fp128 %quad) {
+; CHECK-LABEL: test_inline_constraint_w:
+  call <8 x i8> asm sideeffect "add $0.8b, $1.8b, $1.8b", "=w,w"(<8 x i8> %vec64)
+  call <8 x i8> asm sideeffect "fadd $0.4s, $1.4s, $1.4s", "=w,w"(<4 x float> %vec128)
+; CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+
+  ; Arguably semantically dodgy to output "vN", but it's what GCC does
+  ; so purely for compatibility we want vector registers to be output.
+  call float asm sideeffect "fcvt ${0:s}, ${1:h}", "=w,w"(half undef)
+  call float asm sideeffect "fadd $0.2s, $0.2s, $0.2s", "=w,w"(float %flt)
+  call double asm sideeffect "fadd $0.2d, $0.2d, $0.2d", "=w,w"(double %dbl)
+  call fp128 asm sideeffect "fadd $0.2d, $0.2d, $0.2d", "=w,w"(fp128 %quad)
+; CHECK: fcvt {{s[0-9]+}}, {{h[0-9]+}}
+; CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: fadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK: fadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+  ret void
+}
+
 define void @test_inline_constraint_I() {
 ; CHECK-LABEL: test_inline_constraint_I:
   call void asm sideeffect "add x0, x0, $0", "I"(i32 0)
diff --git a/llvm/test/CodeGen/AArch64/neon-aba-abd.ll b/llvm/test/CodeGen/AArch64/neon-aba-abd.ll
new file mode 100644
index 0000000..b423666
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-aba-abd.ll
@@ -0,0 +1,226 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+declare <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_uabd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_uabd_v8i8:
+  %abd = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: uabd v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %abd
+}
+
+define <8 x i8> @test_uaba_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_uaba_v8i8:
+  %abd = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+  %aba = add <8 x i8> %lhs, %abd
+; CHECK: uaba v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %aba
+}
+
+define <8 x i8> @test_sabd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_sabd_v8i8:
+  %abd = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: sabd v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %abd
+}
+
+define <8 x i8> @test_saba_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_saba_v8i8:
+  %abd = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+  %aba = add <8 x i8> %lhs, %abd
+; CHECK: saba v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %aba
+}
+
+declare <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_uabd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_uabd_v16i8:
+  %abd = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: uabd v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %abd
+}
+
+define <16 x i8> @test_uaba_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_uaba_v16i8:
+  %abd = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+  %aba = add <16 x i8> %lhs, %abd
+; CHECK: uaba v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %aba
+}
+
+define <16 x i8> @test_sabd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_sabd_v16i8:
+  %abd = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: sabd v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %abd
+}
+
+define <16 x i8> @test_saba_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_saba_v16i8:
+  %abd = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+  %aba = add <16 x i8> %lhs, %abd
+; CHECK: saba v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %aba
+}
+
+declare <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_uabd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_uabd_v4i16:
+  %abd = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: uabd v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %abd
+}
+
+define <4 x i16> @test_uaba_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_uaba_v4i16:
+  %abd = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+  %aba = add <4 x i16> %lhs, %abd
+; CHECK: uaba v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %aba
+}
+
+define <4 x i16> @test_sabd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_sabd_v4i16:
+  %abd = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: sabd v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %abd
+}
+
+define <4 x i16> @test_saba_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_saba_v4i16:
+  %abd = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+  %aba = add <4 x i16> %lhs, %abd
+; CHECK: saba v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %aba
+}
+
+declare <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_uabd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_uabd_v8i16:
+  %abd = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: uabd v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %abd
+}
+
+define <8 x i16> @test_uaba_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_uaba_v8i16:
+  %abd = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+  %aba = add <8 x i16> %lhs, %abd
+; CHECK: uaba v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %aba
+}
+
+define <8 x i16> @test_sabd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_sabd_v8i16:
+  %abd = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: sabd v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %abd
+}
+
+define <8 x i16> @test_saba_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_saba_v8i16:
+  %abd = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+  %aba = add <8 x i16> %lhs, %abd
+; CHECK: saba v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %aba
+}
+
+declare <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_uabd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_uabd_v2i32:
+  %abd = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: uabd v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %abd
+}
+
+define <2 x i32> @test_uaba_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_uaba_v2i32:
+  %abd = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+  %aba = add <2 x i32> %lhs, %abd
+; CHECK: uaba v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %aba
+}
+
+define <2 x i32> @test_sabd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_sabd_v2i32:
+  %abd = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: sabd v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %abd
+}
+
+define <2 x i32> @test_saba_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_saba_v2i32:
+  %abd = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+  %aba = add <2 x i32> %lhs, %abd
+; CHECK: saba v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %aba
+}
+
+declare <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_uabd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_uabd_v4i32:
+  %abd = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: uabd v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %abd
+}
+
+define <4 x i32> @test_uaba_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_uaba_v4i32:
+  %abd = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+  %aba = add <4 x i32> %lhs, %abd
+; CHECK: uaba v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %aba
+}
+
+define <4 x i32> @test_sabd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_sabd_v4i32:
+  %abd = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: sabd v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %abd
+}
+
+define <4 x i32> @test_saba_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_saba_v4i32:
+  %abd = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+  %aba = add <4 x i32> %lhs, %abd
+; CHECK: saba v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %aba
+}
+
+declare <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float>, <2 x float>)
+
+define <2 x float> @test_fabd_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; CHECK: test_fabd_v2f32:
+  %abd = call <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+; CHECK: fabd v0.2s, v0.2s, v1.2s
+  ret <2 x float> %abd
+}
+
+declare <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float>, <4 x float>)
+
+define <4 x float> @test_fabd_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; CHECK: test_fabd_v4f32:
+  %abd = call <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+; CHECK: fabd v0.4s, v0.4s, v1.4s
+  ret <4 x float> %abd
+}
+
+declare <2 x double> @llvm.arm.neon.vabds.v2f64(<2 x double>, <2 x double>)
+
+define <2 x double> @test_fabd_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; CHECK: test_fabd_v2f64:
+  %abd = call <2 x double> @llvm.arm.neon.vabds.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+; CHECK: fabd v0.2d, v0.2d, v1.2d
+  ret <2 x double> %abd
+}
\ No newline at end of file
diff --git a/llvm/test/CodeGen/AArch64/neon-add-pairwise.ll b/llvm/test/CodeGen/AArch64/neon-add-pairwise.ll
new file mode 100644
index 0000000..1abfed3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-add-pairwise.ll
@@ -0,0 +1,92 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+declare <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_addp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; CHECK: test_addp_v8i8:
+  %tmp1 = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: addp v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vpadd.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_addp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_addp_v16i8:
+  %tmp1 = call <16 x i8> @llvm.arm.neon.vpadd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: addp v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_addp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_addp_v4i16:
+  %tmp1 = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: addp v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %tmp1
+}
+
+declare <8 x i16> @llvm.arm.neon.vpadd.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_addp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_addp_v8i16:
+  %tmp1 = call <8 x i16> @llvm.arm.neon.vpadd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: addp v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %tmp1
+}
+
+declare <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_addp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_addp_v2i32:
+  %tmp1 = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: addp v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vpadd.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_addp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_addp_v4i32:
+  %tmp1 = call <4 x i32> @llvm.arm.neon.vpadd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: addp v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %tmp1
+}
+
+
+declare <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64>, <2 x i64>)
+
+define <2 x i64> @test_addp_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_addp_v2i64:
+        %val = call <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: addp v0.2d, v0.2d, v1.2d
+        ret <2 x i64> %val
+}
+
+declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.arm.neon.vpadd.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.arm.neon.vpadd.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @test_faddp_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; CHECK: test_faddp_v2f32:
+        %val = call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+; CHECK: faddp v0.2s, v0.2s, v1.2s
+        ret <2 x float> %val
+}
+
+define <4 x float> @test_faddp_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; CHECK: test_faddp_v4f32:
+        %val = call <4 x float> @llvm.arm.neon.vpadd.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+; CHECK: faddp v0.4s, v0.4s, v1.4s
+        ret <4 x float> %val
+}
+
+define <2 x double> @test_faddp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; CHECK: test_faddp_v2f64:
+        %val = call <2 x double> @llvm.arm.neon.vpadd.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+; CHECK: faddp v0.2d, v0.2d, v1.2d
+        ret <2 x double> %val
+}
+
diff --git a/llvm/test/CodeGen/AArch64/neon-add-sub.ll b/llvm/test/CodeGen/AArch64/neon-add-sub.ll
new file mode 100644
index 0000000..65ec8a2
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-add-sub.ll
@@ -0,0 +1,132 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define <8 x i8> @add8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: add {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+	%tmp3 = add <8 x i8> %A, %B;
+	ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @add16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: add {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+	%tmp3 = add <16 x i8> %A, %B;
+	ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @add4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: add {{v[0-31]+}}.4h, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h
+	%tmp3 = add <4 x i16> %A, %B;
+	ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @add8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: add {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+	%tmp3 = add <8 x i16> %A, %B;
+	ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @add2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: add {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+	%tmp3 = add <2 x i32> %A, %B;
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @add4x32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: add {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+	%tmp3 = add <4 x i32> %A, %B;
+	ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @add2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: add {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+	%tmp3 = add <2 x i64> %A, %B;
+	ret <2 x i64> %tmp3
+}
+
+define <2 x float> @add2xfloat(<2 x float> %A, <2 x float> %B) {
+;CHECK: fadd {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+	%tmp3 = fadd <2 x float> %A, %B;
+	ret <2 x float> %tmp3
+}
+
+define <4 x float> @add4xfloat(<4 x float> %A, <4 x float> %B) {
+;CHECK: fadd {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+	%tmp3 = fadd <4 x float> %A, %B;
+	ret <4 x float> %tmp3
+}
+define <2 x double> @add2xdouble(<2 x double> %A, <2 x double> %B) {
+;CHECK: fadd {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+	%tmp3 = fadd <2 x double> %A, %B;
+	ret <2 x double> %tmp3
+}
+
+define <8 x i8> @sub8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: sub {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+	%tmp3 = sub <8 x i8> %A, %B;
+	ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @sub16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: sub {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+	%tmp3 = sub <16 x i8> %A, %B;
+	ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @sub4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: sub {{v[0-31]+}}.4h, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h
+	%tmp3 = sub <4 x i16> %A, %B;
+	ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @sub8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: sub {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+	%tmp3 = sub <8 x i16> %A, %B;
+	ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @sub2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: sub {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+	%tmp3 = sub <2 x i32> %A, %B;
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @sub4x32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: sub {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+	%tmp3 = sub <4 x i32> %A, %B;
+	ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @sub2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: sub {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+	%tmp3 = sub <2 x i64> %A, %B;
+	ret <2 x i64> %tmp3
+}
+
+define <2 x float> @sub2xfloat(<2 x float> %A, <2 x float> %B) {
+;CHECK: fsub {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+	%tmp3 = fsub <2 x float> %A, %B;
+	ret <2 x float> %tmp3
+}
+
+define <4 x float> @sub4xfloat(<4 x float> %A, <4 x float> %B) {
+;CHECK: fsub {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+	%tmp3 = fsub <4 x float> %A, %B;
+	ret <4 x float> %tmp3
+}
+define <2 x double> @sub2xdouble(<2 x double> %A, <2 x double> %B) {
+;CHECK: fsub {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+	%tmp3 = fsub <2 x double> %A, %B;
+	ret <2 x double> %tmp3
+}
+
+define <1 x i64> @add1xi64(<1 x i64> %A, <1 x i64> %B) {
+;CHECK: add {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+	%tmp3 = add <1 x i64> %A, %B;
+	ret <1 x i64> %tmp3
+}
+
+define <1 x i64> @sub1xi64(<1 x i64> %A, <1 x i64> %B) {
+;CHECK: sub {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+	%tmp3 = sub <1 x i64> %A, %B;
+	ret <1 x i64> %tmp3
+}
+
diff --git a/llvm/test/CodeGen/AArch64/neon-bitcast.ll b/llvm/test/CodeGen/AArch64/neon-bitcast.ll
new file mode 100644
index 0000000..f9ec704
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-bitcast.ll
@@ -0,0 +1,574 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon -verify-machineinstrs < %s | FileCheck %s
+
+; From <8 x i8>
+
+define <1 x i64> @test_v8i8_to_v1i64(<8 x i8> %in) nounwind {
+; CHECK: test_v8i8_to_v1i64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <8 x i8> %in to <1 x i64>
+  ret <1 x i64> %val
+}
+
+define <2 x i32> @test_v8i8_to_v2i32(<8 x i8> %in) nounwind {
+; CHECK: test_v8i8_to_v2i32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <8 x i8> %in to <2 x i32>
+  ret <2 x i32> %val
+}
+
+define <2 x float> @test_v8i8_to_v2f32(<8 x i8> %in) nounwind {
+; CHECK: test_v8i8_to_v2f32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <8 x i8> %in to <2 x float>
+  ret <2 x float> %val
+}
+
+define <4 x i16> @test_v8i8_to_v4i16(<8 x i8> %in) nounwind{
+; CHECK: test_v8i8_to_v4i16:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <8 x i8> %in to <4 x i16>
+  ret <4 x i16> %val
+}
+
+define <8 x i8> @test_v8i8_to_v8i8(<8 x i8> %in) nounwind{
+; CHECK: test_v8i8_to_v8i8:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <8 x i8> %in to <8 x i8>
+  ret <8 x i8> %val
+}
+
+; From <4 x i16>
+
+define <1 x i64> @test_v4i16_to_v1i64(<4 x i16> %in) nounwind {
+; CHECK: test_v4i16_to_v1i64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <4 x i16> %in to <1 x i64>
+  ret <1 x i64> %val
+}
+
+define <2 x i32> @test_v4i16_to_v2i32(<4 x i16> %in) nounwind {
+; CHECK: test_v4i16_to_v2i32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <4 x i16> %in to <2 x i32>
+  ret <2 x i32> %val
+}
+
+define <2 x float> @test_v4i16_to_v2f32(<4 x i16> %in) nounwind {
+; CHECK: test_v4i16_to_v2f32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <4 x i16> %in to <2 x float>
+  ret <2 x float> %val
+}
+
+define <4 x i16> @test_v4i16_to_v4i16(<4 x i16> %in) nounwind{
+; CHECK: test_v4i16_to_v4i16:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <4 x i16> %in to <4 x i16>
+  ret <4 x i16> %val
+}
+
+define <8 x i8> @test_v4i16_to_v8i8(<4 x i16> %in) nounwind{
+; CHECK: test_v4i16_to_v8i8:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <4 x i16> %in to <8 x i8>
+  ret <8 x i8> %val
+}
+
+; From <2 x i32>
+
+define <1 x i64> @test_v2i32_to_v1i64(<2 x i32> %in) nounwind {
+; CHECK: test_v2i32_to_v1i64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <2 x i32> %in to <1 x i64>
+  ret <1 x i64> %val
+}
+
+define <2 x i32> @test_v2i32_to_v2i32(<2 x i32> %in) nounwind {
+; CHECK: test_v2i32_to_v2i32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <2 x i32> %in to <2 x i32>
+  ret <2 x i32> %val
+}
+
+define <2 x float> @test_v2i32_to_v2f32(<2 x i32> %in) nounwind {
+; CHECK: test_v2i32_to_v2f32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <2 x i32> %in to <2 x float>
+  ret <2 x float> %val
+}
+
+define <4 x i16> @test_v2i32_to_v4i16(<2 x i32> %in) nounwind{
+; CHECK: test_v2i32_to_v4i16:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <2 x i32> %in to <4 x i16>
+  ret <4 x i16> %val
+}
+
+define <8 x i8> @test_v2i32_to_v8i8(<2 x i32> %in) nounwind{
+; CHECK: test_v2i32_to_v8i8:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <2 x i32> %in to <8 x i8>
+  ret <8 x i8> %val
+}
+
+; From <2 x float>
+
+define <1 x i64> @test_v2f32_to_v1i64(<2 x float> %in) nounwind {
+; CHECK: test_v2f32_to_v1i64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <2 x float> %in to <1 x i64>
+  ret <1 x i64> %val
+}
+
+define <2 x i32> @test_v2f32_to_v2i32(<2 x float> %in) nounwind {
+; CHECK: test_v2f32_to_v2i32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <2 x float> %in to <2 x i32>
+  ret <2 x i32> %val
+}
+
+define <2 x float> @test_v2f32_to_v2f32(<2 x float> %in) nounwind{
+; CHECK: test_v2f32_to_v2f32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <2 x float> %in to <2 x float>
+  ret <2 x float> %val
+}
+
+define <4 x i16> @test_v2f32_to_v4i16(<2 x float> %in) nounwind{
+; CHECK: test_v2f32_to_v4i16:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <2 x float> %in to <4 x i16>
+  ret <4 x i16> %val
+}
+
+define <8 x i8> @test_v2f32_to_v8i8(<2 x float> %in) nounwind{
+; CHECK: test_v2f32_to_v8i8:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <2 x float> %in to <8 x i8>
+  ret <8 x i8> %val
+}
+
+; From <1 x i64>
+
+define <1 x i64> @test_v1i64_to_v1i64(<1 x i64> %in) nounwind {
+; CHECK: test_v1i64_to_v1i64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <1 x i64> %in to <1 x i64>
+  ret <1 x i64> %val
+}
+
+define <2 x i32> @test_v1i64_to_v2i32(<1 x i64> %in) nounwind {
+; CHECK: test_v1i64_to_v2i32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <1 x i64> %in to <2 x i32>
+  ret <2 x i32> %val
+}
+
+define <2 x float> @test_v1i64_to_v2f32(<1 x i64> %in) nounwind{
+; CHECK: test_v1i64_to_v2f32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <1 x i64> %in to <2 x float>
+  ret <2 x float> %val
+}
+
+define <4 x i16> @test_v1i64_to_v4i16(<1 x i64> %in) nounwind{
+; CHECK: test_v1i64_to_v4i16:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <1 x i64> %in to <4 x i16>
+  ret <4 x i16> %val
+}
+
+define <8 x i8> @test_v1i64_to_v8i8(<1 x i64> %in) nounwind{
+; CHECK: test_v1i64_to_v8i8:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <1 x i64> %in to <8 x i8>
+  ret <8 x i8> %val
+}
+
+
+; From <16 x i8>
+
+define <2 x double> @test_v16i8_to_v2f64(<16 x i8> %in) nounwind {
+; CHECK: test_v16i8_to_v2f64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <16 x i8> %in to <2 x double>
+  ret <2 x double> %val
+}
+
+define <2 x i64> @test_v16i8_to_v2i64(<16 x i8> %in) nounwind {
+; CHECK: test_v16i8_to_v2i64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <16 x i8> %in to <2 x i64>
+  ret <2 x i64> %val
+}
+
+define <4 x i32> @test_v16i8_to_v4i32(<16 x i8> %in) nounwind {
+; CHECK: test_v16i8_to_v4i32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <16 x i8> %in to <4 x i32>
+  ret <4 x i32> %val
+}
+
+define <4 x float> @test_v16i8_to_v4f32(<16 x i8> %in) nounwind {
+; CHECK: test_v16i8_to_v4f32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <16 x i8> %in to <4 x float>
+  ret <4 x float> %val
+}
+
+define <8 x i16> @test_v16i8_to_v8i16(<16 x i8> %in) nounwind{
+; CHECK: test_v16i8_to_v8i16:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <16 x i8> %in to <8 x i16>
+  ret <8 x i16> %val
+}
+
+define <16 x i8> @test_v16i8_to_v16i8(<16 x i8> %in) nounwind{
+; CHECK: test_v16i8_to_v16i8:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <16 x i8> %in to <16 x i8>
+  ret <16 x i8> %val
+}
+
+; From <8 x i16>
+
+define <2 x double> @test_v8i16_to_v2f64(<8 x i16> %in) nounwind {
+; CHECK: test_v8i16_to_v2f64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <8 x i16> %in to <2 x double>
+  ret <2 x double> %val
+}
+
+define <2 x i64> @test_v8i16_to_v2i64(<8 x i16> %in) nounwind {
+; CHECK: test_v8i16_to_v2i64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <8 x i16> %in to <2 x i64>
+  ret <2 x i64> %val
+}
+
+define <4 x i32> @test_v8i16_to_v4i32(<8 x i16> %in) nounwind {
+; CHECK: test_v8i16_to_v4i32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <8 x i16> %in to <4 x i32>
+  ret <4 x i32> %val
+}
+
+define <4 x float> @test_v8i16_to_v4f32(<8 x i16> %in) nounwind {
+; CHECK: test_v8i16_to_v4f32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <8 x i16> %in to <4 x float>
+  ret <4 x float> %val
+}
+
+define <8 x i16> @test_v8i16_to_v8i16(<8 x i16> %in) nounwind{
+; CHECK: test_v8i16_to_v8i16:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <8 x i16> %in to <8 x i16>
+  ret <8 x i16> %val
+}
+
+define <16 x i8> @test_v8i16_to_v16i8(<8 x i16> %in) nounwind{
+; CHECK: test_v8i16_to_v16i8:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <8 x i16> %in to <16 x i8>
+  ret <16 x i8> %val
+}
+
+; From <4 x i32>
+
+define <2 x double> @test_v4i32_to_v2f64(<4 x i32> %in) nounwind {
+; CHECK: test_v4i32_to_v2f64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <4 x i32> %in to <2 x double>
+  ret <2 x double> %val
+}
+
+define <2 x i64> @test_v4i32_to_v2i64(<4 x i32> %in) nounwind {
+; CHECK: test_v4i32_to_v2i64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <4 x i32> %in to <2 x i64>
+  ret <2 x i64> %val
+}
+
+define <4 x i32> @test_v4i32_to_v4i32(<4 x i32> %in) nounwind {
+; CHECK: test_v4i32_to_v4i32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <4 x i32> %in to <4 x i32>
+  ret <4 x i32> %val
+}
+
+define <4 x float> @test_v4i32_to_v4f32(<4 x i32> %in) nounwind {
+; CHECK: test_v4i32_to_v4f32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <4 x i32> %in to <4 x float>
+  ret <4 x float> %val
+}
+
+define <8 x i16> @test_v4i32_to_v8i16(<4 x i32> %in) nounwind{
+; CHECK: test_v4i32_to_v8i16:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <4 x i32> %in to <8 x i16>
+  ret <8 x i16> %val
+}
+
+define <16 x i8> @test_v4i32_to_v16i8(<4 x i32> %in) nounwind{
+; CHECK: test_v4i32_to_v16i8:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <4 x i32> %in to <16 x i8>
+  ret <16 x i8> %val
+}
+
+; From <4 x float>
+
+define <2 x double> @test_v4f32_to_v2f64(<4 x float> %in) nounwind {
+; CHECK: test_v4f32_to_v2f64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <4 x float> %in to <2 x double>
+  ret <2 x double> %val
+}
+
+define <2 x i64> @test_v4f32_to_v2i64(<4 x float> %in) nounwind {
+; CHECK: test_v4f32_to_v2i64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <4 x float> %in to <2 x i64>
+  ret <2 x i64> %val
+}
+
+define <4 x i32> @test_v4f32_to_v4i32(<4 x float> %in) nounwind {
+; CHECK: test_v4f32_to_v4i32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <4 x float> %in to <4 x i32>
+  ret <4 x i32> %val
+}
+
+define <4 x float> @test_v4f32_to_v4f32(<4 x float> %in) nounwind{
+; CHECK: test_v4f32_to_v4f32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <4 x float> %in to <4 x float>
+  ret <4 x float> %val
+}
+
+define <8 x i16> @test_v4f32_to_v8i16(<4 x float> %in) nounwind{
+; CHECK: test_v4f32_to_v8i16:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <4 x float> %in to <8 x i16>
+  ret <8 x i16> %val
+}
+
+define <16 x i8> @test_v4f32_to_v16i8(<4 x float> %in) nounwind{
+; CHECK: test_v4f32_to_v16i8:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <4 x float> %in to <16 x i8>
+  ret <16 x i8> %val
+}
+
+; From <2 x i64>
+
+define <2 x double> @test_v2i64_to_v2f64(<2 x i64> %in) nounwind {
+; CHECK: test_v2i64_to_v2f64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <2 x i64> %in to <2 x double>
+  ret <2 x double> %val
+}
+
+define <2 x i64> @test_v2i64_to_v2i64(<2 x i64> %in) nounwind {
+; CHECK: test_v2i64_to_v2i64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <2 x i64> %in to <2 x i64>
+  ret <2 x i64> %val
+}
+
+define <4 x i32> @test_v2i64_to_v4i32(<2 x i64> %in) nounwind {
+; CHECK: test_v2i64_to_v4i32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <2 x i64> %in to <4 x i32>
+  ret <4 x i32> %val
+}
+
+define <4 x float> @test_v2i64_to_v4f32(<2 x i64> %in) nounwind{
+; CHECK: test_v2i64_to_v4f32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <2 x i64> %in to <4 x float>
+  ret <4 x float> %val
+}
+
+define <8 x i16> @test_v2i64_to_v8i16(<2 x i64> %in) nounwind{
+; CHECK: test_v2i64_to_v8i16:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <2 x i64> %in to <8 x i16>
+  ret <8 x i16> %val
+}
+
+define <16 x i8> @test_v2i64_to_v16i8(<2 x i64> %in) nounwind{
+; CHECK: test_v2i64_to_v16i8:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <2 x i64> %in to <16 x i8>
+  ret <16 x i8> %val
+}
+
+; From <2 x double>
+
+define <2 x double> @test_v2f64_to_v2f64(<2 x double> %in) nounwind {
+; CHECK: test_v2f64_to_v2f64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <2 x double> %in to <2 x double>
+  ret <2 x double> %val
+}
+
+define <2 x i64> @test_v2f64_to_v2i64(<2 x double> %in) nounwind {
+; CHECK: test_v2f64_to_v2i64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <2 x double> %in to <2 x i64>
+  ret <2 x i64> %val
+}
+
+define <4 x i32> @test_v2f64_to_v4i32(<2 x double> %in) nounwind {
+; CHECK: test_v2f64_to_v4i32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <2 x double> %in to <4 x i32>
+  ret <4 x i32> %val
+}
+
+define <4 x float> @test_v2f64_to_v4f32(<2 x double> %in) nounwind{
+; CHECK: test_v2f64_to_v4f32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <2 x double> %in to <4 x float>
+  ret <4 x float> %val
+}
+
+define <8 x i16> @test_v2f64_to_v8i16(<2 x double> %in) nounwind{
+; CHECK: test_v2f64_to_v8i16:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <2 x double> %in to <8 x i16>
+  ret <8 x i16> %val
+}
+
+define <16 x i8> @test_v2f64_to_v16i8(<2 x double> %in) nounwind{
+; CHECK: test_v2f64_to_v16i8:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+  %val = bitcast <2 x double> %in to <16 x i8>
+  ret <16 x i8> %val
+}
+
diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
new file mode 100644
index 0000000..1c43b97
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -0,0 +1,594 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+
+define <8 x i8> @and8xi8(<8 x i8> %a, <8 x i8> %b) {
+;CHECK: and {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+	%tmp1 = and <8 x i8> %a, %b;
+	ret <8 x i8> %tmp1
+}
+
+define <16 x i8> @and16xi8(<16 x i8> %a, <16 x i8> %b) {
+;CHECK: and {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+	%tmp1 = and <16 x i8> %a, %b;
+	ret <16 x i8> %tmp1
+}
+
+
+define <8 x i8> @orr8xi8(<8 x i8> %a, <8 x i8> %b) {
+;CHECK: orr {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+	%tmp1 = or <8 x i8> %a, %b;
+	ret <8 x i8> %tmp1
+}
+
+define <16 x i8> @orr16xi8(<16 x i8> %a, <16 x i8> %b) {
+;CHECK: orr {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+	%tmp1 = or <16 x i8> %a, %b;
+	ret <16 x i8> %tmp1
+}
+
+
+define <8 x i8> @xor8xi8(<8 x i8> %a, <8 x i8> %b) {
+;CHECK: eor {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+	%tmp1 = xor <8 x i8> %a, %b;
+	ret <8 x i8> %tmp1
+}
+
+define <16 x i8> @xor16xi8(<16 x i8> %a, <16 x i8> %b) {
+;CHECK: eor {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+	%tmp1 = xor <16 x i8> %a, %b;
+	ret <16 x i8> %tmp1
+}
+
+define <8 x i8> @bsl8xi8_const(<8 x i8> %a, <8 x i8> %b)  {
+;CHECK:  bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+	%tmp1 = and <8 x i8> %a, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
+	%tmp2 = and <8 x i8> %b, < i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0 >
+	%tmp3 = or <8 x i8> %tmp1, %tmp2
+	ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @bsl16xi8_const(<16 x i8> %a, <16 x i8> %b) {
+;CHECK:  bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+	%tmp1 = and <16 x i8> %a, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
+	%tmp2 = and <16 x i8> %b, < i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0 >
+	%tmp3 = or <16 x i8> %tmp1, %tmp2
+	ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @orn8xi8(<8 x i8> %a, <8 x i8> %b)  {
+;CHECK:  orn {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+  %tmp1 = xor <8 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
+  %tmp2 = or <8 x i8> %a, %tmp1
+  ret <8 x i8> %tmp2
+}
+
+define <16 x i8> @orn16xi8(<16 x i8> %a, <16 x i8> %b) {
+;CHECK:  orn {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+  %tmp1 = xor <16 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
+  %tmp2 = or <16 x i8> %a, %tmp1
+  ret <16 x i8> %tmp2
+}
+
+define <8 x i8> @bic8xi8(<8 x i8> %a, <8 x i8> %b)  {
+;CHECK:  bic {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+  %tmp1 = xor <8 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
+  %tmp2 = and <8 x i8> %a, %tmp1
+  ret <8 x i8> %tmp2
+}
+
+define <16 x i8> @bic16xi8(<16 x i8> %a, <16 x i8> %b) {
+;CHECK:  bic {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+  %tmp1 = xor <16 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
+  %tmp2 = and <16 x i8> %a, %tmp1
+  ret <16 x i8> %tmp2
+}
+
+define <2 x i32> @orrimm2s_lsl0(<2 x i32> %a) {
+;CHECK:  orr {{v[0-31]+}}.2s, #0xff
+	%tmp1 = or <2 x i32> %a, < i32 255, i32 255>
+	ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @orrimm2s_lsl8(<2 x i32> %a) {
+;CHECK:  orr {{v[0-31]+}}.2s, #0xff, lsl #8
+	%tmp1 = or <2 x i32> %a, < i32 65280, i32 65280>
+	ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @orrimm2s_lsl16(<2 x i32> %a) {
+;CHECK:  orr {{v[0-31]+}}.2s, #0xff, lsl #16
+	%tmp1 = or <2 x i32> %a, < i32 16711680, i32 16711680>
+	ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @orrimm2s_lsl24(<2 x i32> %a) {
+;CHECK:  orr {{v[0-31]+}}.2s, #0xff, lsl #24
+	%tmp1 = or <2 x i32> %a, < i32 4278190080, i32 4278190080>
+	ret <2 x i32> %tmp1
+}
+
+define <4 x i32> @orrimm4s_lsl0(<4 x i32> %a) {
+;CHECK:  orr {{v[0-31]+}}.4s, #0xff
+	%tmp1 = or <4 x i32> %a, < i32 255, i32 255, i32 255, i32 255>
+	ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @orrimm4s_lsl8(<4 x i32> %a) {
+;CHECK:  orr {{v[0-31]+}}.4s, #0xff, lsl #8
+	%tmp1 = or <4 x i32> %a, < i32 65280, i32 65280, i32 65280, i32 65280>
+	ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @orrimm4s_lsl16(<4 x i32> %a) {
+;CHECK:  orr {{v[0-31]+}}.4s, #0xff, lsl #16
+	%tmp1 = or <4 x i32> %a, < i32 16711680, i32 16711680, i32 16711680, i32 16711680>
+	ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @orrimm4s_lsl24(<4 x i32> %a) {
+;CHECK:  orr {{v[0-31]+}}.4s, #0xff, lsl #24
+	%tmp1 = or <4 x i32> %a, < i32 4278190080, i32 4278190080, i32 4278190080, i32 4278190080>
+	ret <4 x i32> %tmp1
+}
+
+define <4 x i16> @orrimm4h_lsl0(<4 x i16> %a) {
+;CHECK:  orr {{v[0-31]+}}.4h, #0xff
+	%tmp1 = or <4 x i16> %a, < i16 255, i16 255, i16 255, i16 255 >
+	ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @orrimm4h_lsl8(<4 x i16> %a) {
+;CHECK:  orr {{v[0-31]+}}.4h, #0xff, lsl #8
+	%tmp1 = or <4 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280 >
+	ret <4 x i16> %tmp1
+}
+
+define <8 x i16> @orrimm8h_lsl0(<8 x i16> %a) {
+;CHECK:  orr {{v[0-31]+}}.8h, #0xff
+	%tmp1 = or <8 x i16> %a, < i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255 >
+	ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @orrimm8h_lsl8(<8 x i16> %a) {
+;CHECK:  orr {{v[0-31]+}}.8h, #0xff, lsl #8
+	%tmp1 = or <8 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280 >
+	ret <8 x i16> %tmp1
+}
+
+define <2 x i32> @bicimm2s_lsl0(<2 x i32> %a) {
+;CHECK:  bic {{v[0-31]+}}.2s, #0x10
+	%tmp1 = and <2 x i32> %a, < i32 4294967279, i32 4294967279 >
+	ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @bicimm2s_lsl8(<2 x i32> %a) {
+;CHECK:  bic {{v[0-31]+}}.2s, #0x10, lsl #8
+	%tmp1 = and <2 x i32> %a, < i32 4294963199, i32 4294963199 >
+	ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @bicimm2s_lsl16(<2 x i32> %a) {
+;CHECK:  bic {{v[0-31]+}}.2s, #0x10, lsl #16
+	%tmp1 = and <2 x i32> %a, < i32 4293918719, i32 4293918719 >
+	ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @bicimm2s_lsl24(<2 x i32> %a) {
+;CHECK:  bic {{v[0-31]+}}.2s, #0x10, lsl #24
+	%tmp1 = and <2 x i32> %a, < i32 4026531839, i32 4026531839 >
+	ret <2 x i32> %tmp1
+}
+
+define <4 x i32> @bicimm4s_lsl0(<4 x i32> %a) {
+;CHECK:  bic {{v[0-31]+}}.4s, #0x10
+	%tmp1 = and <4 x i32> %a, < i32 4294967279, i32 4294967279, i32 4294967279, i32 4294967279 >
+	ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @bicimm4s_lsl8(<4 x i32> %a) {
+;CHECK:  bic {{v[0-31]+}}.4s, #0x10, lsl #8
+	%tmp1 = and <4 x i32> %a, < i32 4294963199, i32 4294963199, i32 4294963199, i32 4294963199 >
+	ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @bicimm4s_lsl16(<4 x i32> %a) {
+;CHECK:  bic {{v[0-31]+}}.4s, #0x10, lsl #16
+	%tmp1 = and <4 x i32> %a, < i32 4293918719, i32 4293918719, i32 4293918719, i32 4293918719 >
+	ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @bicimm4s_lsl24(<4 x i32> %a) {
+;CHECK:  bic {{v[0-31]+}}.4s, #0x10, lsl #24
+	%tmp1 = and <4 x i32> %a, < i32 4026531839, i32 4026531839, i32 4026531839, i32 4026531839 >
+	ret <4 x i32> %tmp1
+}
+
+define <4 x i16> @bicimm4h_lsl0_a(<4 x i16> %a) {
+;CHECK:  bic {{v[0-31]+}}.4h, #0x10
+	%tmp1 = and <4 x i16> %a, < i16 65519, i16 65519, i16 65519, i16 65519 >
+	ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @bicimm4h_lsl0_b(<4 x i16> %a) {
+;CHECK:  bic {{v[0-31]+}}.4h, #0xff
+	%tmp1 = and <4 x i16> %a, < i16 65280, i16  65280, i16  65280, i16 65280 >
+	ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @bicimm4h_lsl8_a(<4 x i16> %a) {
+;CHECK:  bic {{v[0-31]+}}.4h, #0x10, lsl #8
+	%tmp1 = and <4 x i16> %a, < i16 61439, i16 61439, i16 61439, i16 61439 >
+	ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @bicimm4h_lsl8_b(<4 x i16> %a) {
+;CHECK:  bic {{v[0-31]+}}.4h, #0xff, lsl #8
+	%tmp1 = and <4 x i16> %a, < i16 255, i16 255, i16 255, i16 255>
+	ret <4 x i16> %tmp1
+}
+
+define <8 x i16> @bicimm8h_lsl0_a(<8 x i16> %a) {
+;CHECK:  bic {{v[0-31]+}}.8h, #0x10
+	%tmp1 = and <8 x i16> %a, < i16 65519, i16 65519, i16 65519, i16 65519,
+   i16 65519, i16 65519, i16 65519, i16 65519 >
+	ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @bicimm8h_lsl0_b(<8 x i16> %a) {
+;CHECK:  bic {{v[0-31]+}}.8h, #0xff
+	%tmp1 = and <8 x i16> %a, < i16 65280, i16  65280, i16  65280, i16 65280, i16 65280, i16  65280, i16  65280, i16 65280 >
+	ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @bicimm8h_lsl8_a(<8 x i16> %a) {
+;CHECK:  bic {{v[0-31]+}}.8h, #0x10, lsl #8
+	%tmp1 = and <8 x i16> %a, < i16 61439, i16 61439, i16 61439, i16 61439,
+   i16 61439, i16 61439, i16 61439, i16 61439 >
+	ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @bicimm8h_lsl8_b(<8 x i16> %a) {
+;CHECK:  bic {{v[0-31]+}}.8h, #0xff, lsl #8
+	%tmp1 = and <8 x i16> %a, < i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+	ret <8 x i16> %tmp1
+}
+
+define <2 x i32> @and2xi32(<2 x i32> %a, <2 x i32> %b) {
+;CHECK: and {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+	%tmp1 = and <2 x i32> %a, %b;
+	ret <2 x i32> %tmp1
+}
+
+define <4 x i16> @and4xi16(<4 x i16> %a, <4 x i16> %b) {
+;CHECK: and {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+	%tmp1 = and <4 x i16> %a, %b;
+	ret <4 x i16> %tmp1
+}
+
+define <1 x i64> @and1xi64(<1 x i64> %a, <1 x i64> %b) {
+;CHECK: and {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+	%tmp1 = and <1 x i64> %a, %b;
+	ret <1 x i64> %tmp1
+}
+
+define <4 x i32> @and4xi32(<4 x i32> %a, <4 x i32> %b) {
+;CHECK: and {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+	%tmp1 = and <4 x i32> %a, %b;
+	ret <4 x i32> %tmp1
+}
+
+define <8 x i16> @and8xi16(<8 x i16> %a, <8 x i16> %b) {
+;CHECK: and {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+	%tmp1 = and <8 x i16> %a, %b;
+	ret <8 x i16> %tmp1
+}
+
+define <2 x i64> @and2xi64(<2 x i64> %a, <2 x i64> %b) {
+;CHECK: and {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+	%tmp1 = and <2 x i64> %a, %b;
+	ret <2 x i64> %tmp1
+}
+
+define <2 x i32> @orr2xi32(<2 x i32> %a, <2 x i32> %b) {
+;CHECK: orr {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+	%tmp1 = or <2 x i32> %a, %b;
+	ret <2 x i32> %tmp1
+}
+
+define <4 x i16> @orr4xi16(<4 x i16> %a, <4 x i16> %b) {
+;CHECK: orr {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+	%tmp1 = or <4 x i16> %a, %b;
+	ret <4 x i16> %tmp1
+}
+
+define <1 x i64> @orr1xi64(<1 x i64> %a, <1 x i64> %b) {
+;CHECK: orr {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+	%tmp1 = or <1 x i64> %a, %b;
+	ret <1 x i64> %tmp1
+}
+
+define <4 x i32> @orr4xi32(<4 x i32> %a, <4 x i32> %b) {
+;CHECK: orr {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+	%tmp1 = or <4 x i32> %a, %b;
+	ret <4 x i32> %tmp1
+}
+
+define <8 x i16> @orr8xi16(<8 x i16> %a, <8 x i16> %b) {
+;CHECK: orr {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+	%tmp1 = or <8 x i16> %a, %b;
+	ret <8 x i16> %tmp1
+}
+
+define <2 x i64> @orr2xi64(<2 x i64> %a, <2 x i64> %b) {
+;CHECK: orr {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+	%tmp1 = or <2 x i64> %a, %b;
+	ret <2 x i64> %tmp1
+}
+
+define <2 x i32> @eor2xi32(<2 x i32> %a, <2 x i32> %b) {
+;CHECK: eor {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+	%tmp1 = xor <2 x i32> %a, %b;
+	ret <2 x i32> %tmp1
+}
+
+define <4 x i16> @eor4xi16(<4 x i16> %a, <4 x i16> %b) {
+;CHECK: eor {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+	%tmp1 = xor <4 x i16> %a, %b;
+	ret <4 x i16> %tmp1
+}
+
+define <1 x i64> @eor1xi64(<1 x i64> %a, <1 x i64> %b) {
+;CHECK: eor {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+	%tmp1 = xor <1 x i64> %a, %b;
+	ret <1 x i64> %tmp1
+}
+
+define <4 x i32> @eor4xi32(<4 x i32> %a, <4 x i32> %b) {
+;CHECK: eor {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+	%tmp1 = xor <4 x i32> %a, %b;
+	ret <4 x i32> %tmp1
+}
+
+define <8 x i16> @eor8xi16(<8 x i16> %a, <8 x i16> %b) {
+;CHECK: eor {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+	%tmp1 = xor <8 x i16> %a, %b;
+	ret <8 x i16> %tmp1
+}
+
+define <2 x i64> @eor2xi64(<2 x i64> %a, <2 x i64> %b) {
+;CHECK: eor {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+	%tmp1 = xor <2 x i64> %a, %b;
+	ret <2 x i64> %tmp1
+}
+
+
+define <2 x i32> @bic2xi32(<2 x i32> %a, <2 x i32> %b)  {
+;CHECK:  bic {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+  %tmp1 = xor <2 x i32> %b, < i32 -1, i32 -1 >
+  %tmp2 = and <2 x i32> %a, %tmp1
+  ret <2 x i32> %tmp2
+}
+
+define <4 x i16> @bic4xi16(<4 x i16> %a, <4 x i16> %b)  {
+;CHECK:  bic {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+  %tmp1 = xor <4 x i16> %b, < i16 -1, i16 -1, i16 -1, i16 -1 >
+  %tmp2 = and <4 x i16> %a, %tmp1
+  ret <4 x i16> %tmp2
+}
+
+define <1 x i64> @bic1xi64(<1 x i64> %a, <1 x i64> %b)  {
+;CHECK:  bic {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+  %tmp1 = xor <1 x i64> %b, < i64 -1>
+  %tmp2 = and <1 x i64> %a, %tmp1
+  ret <1 x i64> %tmp2
+}
+
+define <4 x i32> @bic4xi32(<4 x i32> %a, <4 x i32> %b)  {
+;CHECK:  bic {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+  %tmp1 = xor <4 x i32> %b, < i32 -1, i32 -1, i32 -1, i32 -1>
+  %tmp2 = and <4 x i32> %a, %tmp1
+  ret <4 x i32> %tmp2
+}
+
+define <8 x i16> @bic8xi16(<8 x i16> %a, <8 x i16> %b)  {
+;CHECK:  bic {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+  %tmp1 = xor <8 x i16> %b, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
+  %tmp2 = and <8 x i16> %a, %tmp1
+  ret <8 x i16> %tmp2
+}
+
+define <2 x i64> @bic2xi64(<2 x i64> %a, <2 x i64> %b)  {
+;CHECK:  bic {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+  %tmp1 = xor <2 x i64> %b, < i64 -1, i64 -1>
+  %tmp2 = and <2 x i64> %a, %tmp1
+  ret <2 x i64> %tmp2
+}
+
+define <2 x i32> @orn2xi32(<2 x i32> %a, <2 x i32> %b)  {
+;CHECK:  orn {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+  %tmp1 = xor <2 x i32> %b, < i32 -1, i32 -1 >
+  %tmp2 = or <2 x i32> %a, %tmp1
+  ret <2 x i32> %tmp2
+}
+
+define <4 x i16> @orn4xi16(<4 x i16> %a, <4 x i16> %b)  {
+;CHECK:  orn {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+  %tmp1 = xor <4 x i16> %b, < i16 -1, i16 -1, i16 -1, i16 -1 >
+  %tmp2 = or <4 x i16> %a, %tmp1
+  ret <4 x i16> %tmp2
+}
+
+define <1 x i64> @orn1xi64(<1 x i64> %a, <1 x i64> %b)  {
+;CHECK:  orn {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+  %tmp1 = xor <1 x i64> %b, < i64 -1>
+  %tmp2 = or <1 x i64> %a, %tmp1
+  ret <1 x i64> %tmp2
+}
+
+define <4 x i32> @orn4xi32(<4 x i32> %a, <4 x i32> %b)  {
+;CHECK:  orn {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+  %tmp1 = xor <4 x i32> %b, < i32 -1, i32 -1, i32 -1, i32 -1>
+  %tmp2 = or <4 x i32> %a, %tmp1
+  ret <4 x i32> %tmp2
+}
+
+define <8 x i16> @orn8xi16(<8 x i16> %a, <8 x i16> %b)  {
+;CHECK:  orn {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+  %tmp1 = xor <8 x i16> %b, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
+  %tmp2 = or <8 x i16> %a, %tmp1
+  ret <8 x i16> %tmp2
+}
+
+define <2 x i64> @orn2xi64(<2 x i64> %a, <2 x i64> %b)  {
+;CHECK:  orn {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+  %tmp1 = xor <2 x i64> %b, < i64 -1, i64 -1>
+  %tmp2 = or <2 x i64> %a, %tmp1
+  ret <2 x i64> %tmp2
+}
+define <2 x i32> @bsl2xi32_const(<2 x i32> %a, <2 x i32> %b)  {
+;CHECK:  bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+	%tmp1 = and <2 x i32> %a, < i32 -1, i32 -1 >
+	%tmp2 = and <2 x i32> %b, < i32 0, i32 0 >
+	%tmp3 = or <2 x i32> %tmp1, %tmp2
+	ret <2 x i32> %tmp3
+}
+
+
+define <4 x i16> @bsl4xi16_const(<4 x i16> %a, <4 x i16> %b)  {
+;CHECK:  bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+	%tmp1 = and <4 x i16> %a, < i16 -1, i16 -1, i16 -1,i16 -1 >
+	%tmp2 = and <4 x i16> %b, < i16 0, i16 0,i16 0, i16 0 >
+	%tmp3 = or <4 x i16> %tmp1, %tmp2
+	ret <4 x i16> %tmp3
+}
+
+define <1 x i64> @bsl1xi64_const(<1 x i64> %a, <1 x i64> %b)  {
+;CHECK:  bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+	%tmp1 = and <1 x i64> %a, < i64 -1 >
+	%tmp2 = and <1 x i64> %b, < i64 0 >
+	%tmp3 = or <1 x i64> %tmp1, %tmp2
+	ret <1 x i64> %tmp3
+}
+
+define <4 x i32> @bsl4xi32_const(<4 x i32> %a, <4 x i32> %b)  {
+;CHECK:  bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+	%tmp1 = and <4 x i32> %a, < i32 -1, i32 -1, i32 -1, i32 -1 >
+	%tmp2 = and <4 x i32> %b, < i32 0, i32 0, i32 0, i32 0 >
+	%tmp3 = or <4 x i32> %tmp1, %tmp2
+	ret <4 x i32> %tmp3
+}
+
+define <8 x i16> @bsl8xi16_const(<8 x i16> %a, <8 x i16> %b)  {
+;CHECK:  bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+	%tmp1 = and <8 x i16> %a, < i16 -1, i16 -1, i16 -1,i16 -1, i16 -1, i16 -1, i16 -1,i16 -1 >
+	%tmp2 = and <8 x i16> %b, < i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0 >
+	%tmp3 = or <8 x i16> %tmp1, %tmp2
+	ret <8 x i16> %tmp3
+}
+
+define <2 x i64> @bsl2xi64_const(<2 x i64> %a, <2 x i64> %b)  {
+;CHECK:  bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+	%tmp1 = and <2 x i64> %a, < i64 -1, i64 -1 >
+	%tmp2 = and <2 x i64> %b, < i64 0, i64 0 >
+	%tmp3 = or <2 x i64> %tmp1, %tmp2
+	ret <2 x i64> %tmp3
+}
+
+
+define <8 x i8> @bsl8xi8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
+;CHECK:  bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+  %1 = and <8 x i8> %v1, %v2
+  %2 = xor <8 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %3 = and <8 x i8> %2, %v3
+  %4 = or <8 x i8> %1, %3
+  ret <8 x i8> %4
+}
+
+define <4 x i16> @bsl4xi16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) {
+;CHECK:  bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+  %1 = and <4 x i16> %v1, %v2
+  %2 = xor <4 x i16> %v1, <i16 -1, i16 -1, i16 -1, i16 -1>
+  %3 = and <4 x i16> %2, %v3
+  %4 = or <4 x i16> %1, %3
+  ret <4 x i16> %4
+}
+
+define <2 x i32> @bsl2xi32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
+;CHECK:  bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+  %1 = and <2 x i32> %v1, %v2
+  %2 = xor <2 x i32> %v1, <i32 -1, i32 -1>
+  %3 = and <2 x i32> %2, %v3
+  %4 = or <2 x i32> %1, %3
+  ret <2 x i32> %4
+}
+
+define <1 x i64> @bsl1xi64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) {
+;CHECK:  bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+  %1 = and <1 x i64> %v1, %v2
+  %2 = xor <1 x i64> %v1, <i64 -1>
+  %3 = and <1 x i64> %2, %v3
+  %4 = or <1 x i64> %1, %3
+  ret <1 x i64> %4
+}
+
+define <16 x i8> @bsl16xi8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) {
+;CHECK:  bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+  %1 = and <16 x i8> %v1, %v2
+  %2 = xor <16 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %3 = and <16 x i8> %2, %v3
+  %4 = or <16 x i8> %1, %3
+  ret <16 x i8> %4
+}
+
+define <8 x i16> @bsl8xi16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) {
+;CHECK:  bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+  %1 = and <8 x i16> %v1, %v2
+  %2 = xor <8 x i16> %v1, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  %3 = and <8 x i16> %2, %v3
+  %4 = or <8 x i16> %1, %3
+  ret <8 x i16> %4
+}
+
+define <4 x i32> @bsl4xi32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
+;CHECK:  bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+  %1 = and <4 x i32> %v1, %v2
+  %2 = xor <4 x i32> %v1, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %3 = and <4 x i32> %2, %v3
+  %4 = or <4 x i32> %1, %3
+  ret <4 x i32> %4
+}
+
+define <2 x i64> @bsl2xi64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) {
+;CHECK:  bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+  %1 = and <2 x i64> %v1, %v2
+  %2 = xor <2 x i64> %v1, <i64 -1, i64 -1>
+  %3 = and <2 x i64> %2, %v3
+  %4 = or <2 x i64> %1, %3
+  ret <2 x i64> %4
+}
+
+define <8 x i8> @orrimm8b_as_orrimm4h_lsl0(<8 x i8> %a) {
+;CHECK:  orr {{v[0-31]+}}.4h, #0xff
+  %val = or <8 x i8> %a, <i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0>
+  ret <8 x i8> %val
+}
+
+define <8 x i8> @orrimm8b_as_orimm4h_lsl8(<8 x i8> %a) {
+;CHECK:  orr {{v[0-31]+}}.4h, #0xff, lsl #8
+  %val = or <8 x i8> %a, <i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
+  ret <8 x i8> %val
+}
+
+define <16 x i8> @orimm16b_as_orrimm8h_lsl0(<16 x i8> %a) {
+;CHECK:  orr {{v[0-31]+}}.8h, #0xff
+  %val = or <16 x i8> %a, <i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0>
+  ret <16 x i8> %val
+}
+
+define <16 x i8> @orimm16b_as_orrimm8h_lsl8(<16 x i8> %a) {
+;CHECK:  orr {{v[0-31]+}}.8h, #0xff, lsl #8
+  %val = or <16 x i8> %a, <i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
+  ret <16 x i8> %val
+}
+
+
diff --git a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
new file mode 100644
index 0000000..0848f9b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
@@ -0,0 +1,1982 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+define <8 x i8> @cmeq8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+	%tmp3 = icmp eq <8 x i8> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmeq16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = icmp eq <16 x i8> %A, %B;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmeq4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+	%tmp3 = icmp eq <4 x i16> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmeq8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+	%tmp3 = icmp eq <8 x i16> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmeq2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+	%tmp3 = icmp eq <2 x i32> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmeq4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+	%tmp3 = icmp eq <4 x i32> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmeq2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+	%tmp3 = icmp eq <2 x i64> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
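+; There is no CMNE instruction; inequality is lowered to CMEQ followed by an
+; inversion (movi of all-ones, then eor), which the tests below expect.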
+define <8 x i8> @cmne8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+	%tmp3 = icmp ne <8 x i8> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmne16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = icmp ne <16 x i8> %A, %B;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmne4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+	%tmp3 = icmp ne <4 x i16> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmne8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = icmp ne <8 x i16> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmne2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+	%tmp3 = icmp ne <2 x i32> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmne4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = icmp ne <4 x i32> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmne2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = icmp ne <2 x i64> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmgt8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+	%tmp3 = icmp sgt <8 x i8> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmgt16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = icmp sgt <16 x i8> %A, %B;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmgt4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+	%tmp3 = icmp sgt <4 x i16> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmgt8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+	%tmp3 = icmp sgt <8 x i16> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmgt2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+	%tmp3 = icmp sgt <2 x i32> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmgt4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+	%tmp3 = icmp sgt <4 x i32> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmgt2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+	%tmp3 = icmp sgt <2 x i64> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmlt8xi8(<8 x i8> %A, <8 x i8> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.8b, v1.8b, v0.8b
+	%tmp3 = icmp slt <8 x i8> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmlt16xi8(<16 x i8> %A, <16 x i8> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.16b, v1.16b, v0.16b
+	%tmp3 = icmp slt <16 x i8> %A, %B;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmlt4xi16(<4 x i16> %A, <4 x i16> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.4h, v1.4h, v0.4h
+	%tmp3 = icmp slt <4 x i16> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmlt8xi16(<8 x i16> %A, <8 x i16> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.8h, v1.8h, v0.8h
+	%tmp3 = icmp slt <8 x i16> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmlt2xi32(<2 x i32> %A, <2 x i32> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+	%tmp3 = icmp slt <2 x i32> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmlt4xi32(<4 x i32> %A, <4 x i32> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+	%tmp3 = icmp slt <4 x i32> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmlt2xi64(<2 x i64> %A, <2 x i64> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+	%tmp3 = icmp slt <2 x i64> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmge8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+	%tmp3 = icmp sge <8 x i8> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmge16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = icmp sge <16 x i8> %A, %B;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmge4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+	%tmp3 = icmp sge <4 x i16> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmge8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+	%tmp3 = icmp sge <8 x i16> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmge2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+	%tmp3 = icmp sge <2 x i32> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmge4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+	%tmp3 = icmp sge <4 x i32> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmge2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+	%tmp3 = icmp sge <2 x i64> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmle8xi8(<8 x i8> %A, <8 x i8> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.8b, v1.8b, v0.8b
+	%tmp3 = icmp sle <8 x i8> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmle16xi8(<16 x i8> %A, <16 x i8> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.16b, v1.16b, v0.16b
+	%tmp3 = icmp sle <16 x i8> %A, %B;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmle4xi16(<4 x i16> %A, <4 x i16> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.4h, v1.4h, v0.4h
+	%tmp3 = icmp sle <4 x i16> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmle8xi16(<8 x i16> %A, <8 x i16> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.8h, v1.8h, v0.8h
+	%tmp3 = icmp sle <8 x i16> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmle2xi32(<2 x i32> %A, <2 x i32> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.2s, v1.2s, v0.2s
+	%tmp3 = icmp sle <2 x i32> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmle4xi32(<4 x i32> %A, <4 x i32> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.4s, v1.4s, v0.4s
+	%tmp3 = icmp sle <4 x i32> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmle2xi64(<2 x i64> %A, <2 x i64> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.2d, v1.2d, v0.2d
+	%tmp3 = icmp sle <2 x i64> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmhi8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmhi {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+	%tmp3 = icmp ugt <8 x i8> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmhi16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmhi {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = icmp ugt <16 x i8> %A, %B;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmhi4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+	%tmp3 = icmp ugt <4 x i16> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmhi8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+	%tmp3 = icmp ugt <8 x i16> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmhi2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmhi {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+	%tmp3 = icmp ugt <2 x i32> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmhi4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmhi {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+	%tmp3 = icmp ugt <4 x i32> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmhi2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+	%tmp3 = icmp ugt <2 x i64> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmlo8xi8(<8 x i8> %A, <8 x i8> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.8b, v1.8b, v0.8b
+	%tmp3 = icmp ult <8 x i8> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmlo16xi8(<16 x i8> %A, <16 x i8> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.16b, v1.16b, v0.16b
+	%tmp3 = icmp ult <16 x i8> %A, %B;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmlo4xi16(<4 x i16> %A, <4 x i16> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.4h, v1.4h, v0.4h
+	%tmp3 = icmp ult <4 x i16> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmlo8xi16(<8 x i16> %A, <8 x i16> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.8h, v1.8h, v0.8h
+	%tmp3 = icmp ult <8 x i16> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmlo2xi32(<2 x i32> %A, <2 x i32> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.2s, v1.2s, v0.2s
+	%tmp3 = icmp ult <2 x i32> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmlo4xi32(<4 x i32> %A, <4 x i32> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.4s, v1.4s, v0.4s
+	%tmp3 = icmp ult <4 x i32> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmlo2xi64(<2 x i64> %A, <2 x i64> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.2d, v1.2d, v0.2d
+	%tmp3 = icmp ult <2 x i64> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmhs8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+	%tmp3 = icmp uge <8 x i8> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmhs16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = icmp uge <16 x i8> %A, %B;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmhs4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+	%tmp3 = icmp uge <4 x i16> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmhs8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+	%tmp3 = icmp uge <8 x i16> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmhs2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmhs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+	%tmp3 = icmp uge <2 x i32> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmhs4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmhs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+	%tmp3 = icmp uge <4 x i32> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmhs2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+	%tmp3 = icmp uge <2 x i64> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmls8xi8(<8 x i8> %A, <8 x i8> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b
+	%tmp3 = icmp ule <8 x i8> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmls16xi8(<16 x i8> %A, <16 x i8> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.16b, v1.16b, v0.16b
+	%tmp3 = icmp ule <16 x i8> %A, %B;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmls4xi16(<4 x i16> %A, <4 x i16> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h
+	%tmp3 = icmp ule <4 x i16> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmls8xi16(<8 x i16> %A, <8 x i16> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.8h, v1.8h, v0.8h
+	%tmp3 = icmp ule <8 x i16> %A, %B;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmls2xi32(<2 x i32> %A, <2 x i32> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s
+	%tmp3 = icmp ule <2 x i32> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmls4xi32(<4 x i32> %A, <4 x i32> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.4s, v1.4s, v0.4s
+	%tmp3 = icmp ule <4 x i32> %A, %B;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmls2xi64(<2 x i64> %A, <2 x i64> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.2d, v1.2d, v0.2d
+	%tmp3 = icmp ule <2 x i64> %A, %B;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmtst8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+	%tmp3 = and <8 x i8> %A, %B
+	%tmp4 = icmp ne <8 x i8> %tmp3, zeroinitializer
+   %tmp5 = sext <8 x i1> %tmp4 to <8 x i8>
+	ret <8 x i8> %tmp5
+}
+
+define <16 x i8> @cmtst16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmtst {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = and <16 x i8> %A, %B
+	%tmp4 = icmp ne <16 x i8> %tmp3, zeroinitializer
+   %tmp5 = sext <16 x i1> %tmp4 to <16 x i8>
+	ret <16 x i8> %tmp5
+}
+
+define <4 x i16> @cmtst4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmtst {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+	%tmp3 = and <4 x i16> %A, %B
+	%tmp4 = icmp ne <4 x i16> %tmp3, zeroinitializer
+   %tmp5 = sext <4 x i1> %tmp4 to <4 x i16>
+	ret <4 x i16> %tmp5
+}
+
+define <8 x i16> @cmtst8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmtst {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+	%tmp3 = and <8 x i16> %A, %B
+	%tmp4 = icmp ne <8 x i16> %tmp3, zeroinitializer
+   %tmp5 = sext <8 x i1> %tmp4 to <8 x i16>
+	ret <8 x i16> %tmp5
+}
+
+define <2 x i32> @cmtst2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmtst {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+	%tmp3 = and <2 x i32> %A, %B
+	%tmp4 = icmp ne <2 x i32> %tmp3, zeroinitializer
+   %tmp5 = sext <2 x i1> %tmp4 to <2 x i32>
+	ret <2 x i32> %tmp5
+}
+
+define <4 x i32> @cmtst4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmtst {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+	%tmp3 = and <4 x i32> %A, %B
+	%tmp4 = icmp ne <4 x i32> %tmp3, zeroinitializer
+   %tmp5 = sext <4 x i1> %tmp4 to <4 x i32>
+	ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @cmtst2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmtst {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+	%tmp3 = and <2 x i64> %A, %B
+	%tmp4 = icmp ne <2 x i64> %tmp3, zeroinitializer
+   %tmp5 = sext <2 x i1> %tmp4 to <2 x i64>
+	ret <2 x i64> %tmp5
+}
+
+
+
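+; Compares against zero: cmeq/cmge/cmgt/cmle/cmlt all have an immediate form
+; taking #0x0, so no zero vector needs to be materialized.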
+define <8 x i8> @cmeqz8xi8(<8 x i8> %A) {
+;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
+	%tmp3 = icmp eq <8 x i8> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmeqz16xi8(<16 x i8> %A) {
+;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
+	%tmp3 = icmp eq <16 x i8> %A, zeroinitializer;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmeqz4xi16(<4 x i16> %A) {
+;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
+	%tmp3 = icmp eq <4 x i16> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmeqz8xi16(<8 x i16> %A) {
+;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
+	%tmp3 = icmp eq <8 x i16> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmeqz2xi32(<2 x i32> %A) {
+;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
+	%tmp3 = icmp eq <2 x i32> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmeqz4xi32(<4 x i32> %A) {
+;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
+	%tmp3 = icmp eq <4 x i32> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmeqz2xi64(<2 x i64> %A) {
+;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
+	%tmp3 = icmp eq <2 x i64> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+
+define <8 x i8> @cmgez8xi8(<8 x i8> %A) {
+;CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
+	%tmp3 = icmp sge <8 x i8> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmgez16xi8(<16 x i8> %A) {
+;CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
+	%tmp3 = icmp sge <16 x i8> %A, zeroinitializer;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmgez4xi16(<4 x i16> %A) {
+;CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
+	%tmp3 = icmp sge <4 x i16> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmgez8xi16(<8 x i16> %A) {
+;CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
+	%tmp3 = icmp sge <8 x i16> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmgez2xi32(<2 x i32> %A) {
+;CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
+	%tmp3 = icmp sge <2 x i32> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmgez4xi32(<4 x i32> %A) {
+;CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
+	%tmp3 = icmp sge <4 x i32> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmgez2xi64(<2 x i64> %A) {
+;CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
+	%tmp3 = icmp sge <2 x i64> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+
+define <8 x i8> @cmgtz8xi8(<8 x i8> %A) {
+;CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
+	%tmp3 = icmp sgt <8 x i8> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmgtz16xi8(<16 x i8> %A) {
+;CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
+	%tmp3 = icmp sgt <16 x i8> %A, zeroinitializer;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmgtz4xi16(<4 x i16> %A) {
+;CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
+	%tmp3 = icmp sgt <4 x i16> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmgtz8xi16(<8 x i16> %A) {
+;CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
+	%tmp3 = icmp sgt <8 x i16> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmgtz2xi32(<2 x i32> %A) {
+;CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
+	%tmp3 = icmp sgt <2 x i32> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmgtz4xi32(<4 x i32> %A) {
+;CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
+	%tmp3 = icmp sgt <4 x i32> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmgtz2xi64(<2 x i64> %A) {
+;CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
+	%tmp3 = icmp sgt <2 x i64> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmlez8xi8(<8 x i8> %A) {
+;CHECK: cmle {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
+	%tmp3 = icmp sle <8 x i8> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmlez16xi8(<16 x i8> %A) {
+;CHECK: cmle {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
+	%tmp3 = icmp sle <16 x i8> %A, zeroinitializer;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmlez4xi16(<4 x i16> %A) {
+;CHECK: cmle {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
+	%tmp3 = icmp sle <4 x i16> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmlez8xi16(<8 x i16> %A) {
+;CHECK: cmle {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
+	%tmp3 = icmp sle <8 x i16> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmlez2xi32(<2 x i32> %A) {
+;CHECK: cmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
+	%tmp3 = icmp sle <2 x i32> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmlez4xi32(<4 x i32> %A) {
+;CHECK: cmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
+	%tmp3 = icmp sle <4 x i32> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmlez2xi64(<2 x i64> %A) {
+;CHECK: cmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
+	%tmp3 = icmp sle <2 x i64> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmltz8xi8(<8 x i8> %A) {
+;CHECK: cmlt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
+	%tmp3 = icmp slt <8 x i8> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmltz16xi8(<16 x i8> %A) {
+;CHECK: cmlt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
+	%tmp3 = icmp slt <16 x i8> %A, zeroinitializer;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmltz4xi16(<4 x i16> %A) {
+;CHECK: cmlt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
+	%tmp3 = icmp slt <4 x i16> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmltz8xi16(<8 x i16> %A) {
+;CHECK: cmlt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
+	%tmp3 = icmp slt <8 x i16> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmltz2xi32(<2 x i32> %A) {
+;CHECK: cmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
+	%tmp3 = icmp slt <2 x i32> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmltz4xi32(<4 x i32> %A) {
+;CHECK: cmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
+	%tmp3 = icmp slt <4 x i32> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmltz2xi64(<2 x i64> %A) {
+;CHECK: cmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
+	%tmp3 = icmp slt <2 x i64> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
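+; As above, NE against zero is CMEQ with #0x0 followed by an inversion.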
+define <8 x i8> @cmneqz8xi8(<8 x i8> %A) {
+;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+	%tmp3 = icmp ne <8 x i8> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmneqz16xi8(<16 x i8> %A) {
+;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = icmp ne <16 x i8> %A, zeroinitializer;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmneqz4xi16(<4 x i16> %A) {
+;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+	%tmp3 = icmp ne <4 x i16> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmneqz8xi16(<8 x i16> %A) {
+;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = icmp ne <8 x i16> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmneqz2xi32(<2 x i32> %A) {
+;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+	%tmp3 = icmp ne <2 x i32> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmneqz4xi32(<4 x i32> %A) {
+;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = icmp ne <4 x i32> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmneqz2xi64(<2 x i64> %A) {
+;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = icmp ne <2 x i64> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
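+; The unsigned compares cmhs/cmhi have no immediate-zero form, so the zero
+; vector is first materialized with movi.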
+define <8 x i8> @cmhsz8xi8(<8 x i8> %A) {
+;CHECK: movi {{v[0-9]+}}.8b, #0x0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+	%tmp3 = icmp uge <8 x i8> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmhsz16xi8(<16 x i8> %A) {
+;CHECK: movi {{v[0-9]+}}.16b, #0x0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = icmp uge <16 x i8> %A, zeroinitializer;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmhsz4xi16(<4 x i16> %A) {
+;CHECK: movi {{v[0-9]+}}.8b, #0x0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+	%tmp3 = icmp uge <4 x i16> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmhsz8xi16(<8 x i16> %A) {
+;CHECK: movi {{v[0-9]+}}.16b, #0x0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+	%tmp3 = icmp uge <8 x i16> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmhsz2xi32(<2 x i32> %A) {
+;CHECK: movi {{v[0-9]+}}.8b, #0x0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+	%tmp3 = icmp uge <2 x i32> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmhsz4xi32(<4 x i32> %A) {
+;CHECK: movi {{v[0-9]+}}.16b, #0x0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+	%tmp3 = icmp uge <4 x i32> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmhsz2xi64(<2 x i64> %A) {
+;CHECK: movi {{v[0-9]+}}.16b, #0x0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+	%tmp3 = icmp uge <2 x i64> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+
+define <8 x i8> @cmhiz8xi8(<8 x i8> %A) {
+;CHECK: movi {{v[0-9]+}}.8b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+	%tmp3 = icmp ugt <8 x i8> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmhiz16xi8(<16 x i8> %A) {
+;CHECK: movi {{v[0-9]+}}.16b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = icmp ugt <16 x i8> %A, zeroinitializer;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmhiz4xi16(<4 x i16> %A) {
+;CHECK: movi {{v[0-9]+}}.8b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+	%tmp3 = icmp ugt <4 x i16> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmhiz8xi16(<8 x i16> %A) {
+;CHECK: movi {{v[0-9]+}}.16b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+	%tmp3 = icmp ugt <8 x i16> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmhiz2xi32(<2 x i32> %A) {
+;CHECK: movi {{v[0-9]+}}.8b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+	%tmp3 = icmp ugt <2 x i32> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmhiz4xi32(<4 x i32> %A) {
+;CHECK: movi {{v[0-9]+}}.16b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+	%tmp3 = icmp ugt <4 x i32> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmhiz2xi64(<2 x i64> %A) {
+;CHECK: movi {{v[0-9]+}}.16b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+	%tmp3 = icmp ugt <2 x i64> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmlsz8xi8(<8 x i8> %A) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi v1.8b, #0x0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b
+	%tmp3 = icmp ule <8 x i8> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmlsz16xi8(<16 x i8> %A) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi v1.16b, #0x0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.16b, v1.16b, v0.16b
+	%tmp3 = icmp ule <16 x i8> %A, zeroinitializer;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmlsz4xi16(<4 x i16> %A) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi v1.8b, #0x0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h
+	%tmp3 = icmp ule <4 x i16> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmlsz8xi16(<8 x i16> %A) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi v1.16b, #0x0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.8h, v1.8h, v0.8h
+	%tmp3 = icmp ule <8 x i16> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmlsz2xi32(<2 x i32> %A) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi v1.8b, #0x0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s
+	%tmp3 = icmp ule <2 x i32> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmlsz4xi32(<4 x i32> %A) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi v1.16b, #0x0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.4s, v1.4s, v0.4s
+	%tmp3 = icmp ule <4 x i32> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmlsz2xi64(<2 x i64> %A) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi v1.16b, #0x0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.2d, v1.2d, v0.2d
+	%tmp3 = icmp ule <2 x i64> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmloz8xi8(<8 x i8> %A) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi v1.8b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.8b, v1.8b, v0.8b
+	%tmp3 = icmp ult <8 x i8> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+	ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmloz16xi8(<16 x i8> %A) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi v1.16b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.16b, v1.16b, v0.16b
+	%tmp3 = icmp ult <16 x i8> %A, zeroinitializer;
+   %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+	ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmloz4xi16(<4 x i16> %A) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi v1.8b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.4h, v1.4h, v0.4h
+	%tmp3 = icmp ult <4 x i16> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+	ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmloz8xi16(<8 x i16> %A) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi v1.16b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.8h, v1.8h, v0.8h
+	%tmp3 = icmp ult <8 x i16> %A, zeroinitializer;
+   %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+	ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmloz2xi32(<2 x i32> %A) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi v1.8b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.2s, v1.2s, v0.2s
+	%tmp3 = icmp ult <2 x i32> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmloz4xi32(<4 x i32> %A) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi v1.16b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.4s, v1.4s, v0.4s
+	%tmp3 = icmp ult <4 x i32> %A, zeroinitializer;
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmloz2xi64(<2 x i64> %A) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi v1.16b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.2d, v1.2d, v0.2d
+	%tmp3 = icmp ult <2 x i64> %A, zeroinitializer;
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+
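+; Floating-point compares: the ordered predicates OEQ/OGE/OGT map directly
+; onto fcmeq/fcmge/fcmgt; the remaining predicates are synthesized from
+; these with operand swaps and/or an inversion, as checked below.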
+define <2 x i32> @fcmoeq2xfloat(<2 x float> %A, <2 x float> %B) {
+;CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+   %tmp3 = fcmp oeq <2 x float> %A, %B
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmoeq4xfloat(<4 x float> %A, <4 x float> %B) {
+;CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+   %tmp3 = fcmp oeq <4 x float> %A, %B
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmoeq2xdouble(<2 x double> %A, <2 x double> %B) {
+;CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+   %tmp3 = fcmp oeq <2 x double> %A, %B
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmoge2xfloat(<2 x float> %A, <2 x float> %B) {
+;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+   %tmp3 = fcmp oge <2 x float> %A, %B
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmoge4xfloat(<4 x float> %A, <4 x float> %B) {
+;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+   %tmp3 = fcmp oge <4 x float> %A, %B
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmoge2xdouble(<2 x double> %A, <2 x double> %B) {
+;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+   %tmp3 = fcmp oge <2 x double> %A, %B
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmogt2xfloat(<2 x float> %A, <2 x float> %B) {
+;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+   %tmp3 = fcmp ogt <2 x float> %A, %B
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmogt4xfloat(<4 x float> %A, <4 x float> %B) {
+;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+   %tmp3 = fcmp ogt <4 x float> %A, %B
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmogt2xdouble(<2 x double> %A, <2 x double> %B) {
+;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+   %tmp3 = fcmp ogt <2 x double> %A, %B
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmole2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; OLE implemented as OGE, so check reversed operands.
+;CHECK: fcmge {{v[0-9]+}}.2s, v1.2s, v0.2s
+   %tmp3 = fcmp ole <2 x float> %A, %B
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmole4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; OLE implemented as OGE, so check reversed operands.
+;CHECK: fcmge {{v[0-9]+}}.4s, v1.4s, v0.4s
+   %tmp3 = fcmp ole <4 x float> %A, %B
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmole2xdouble(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; OLE implemented as OGE, so check reversed operands.
+;CHECK: fcmge {{v[0-9]+}}.2d, v1.2d, v0.2d
+   %tmp3 = fcmp ole <2 x double> %A, %B
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmolt2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+   %tmp3 = fcmp olt <2 x float> %A, %B
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmolt4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+   %tmp3 = fcmp olt <4 x float> %A, %B
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmolt2xdouble(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+   %tmp3 = fcmp olt <2 x double> %A, %B
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmone2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; ONE = OGT | OLT, OLT implemented as OGT so check reversed operands
+;CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+   %tmp3 = fcmp one <2 x float> %A, %B
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmone4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; ONE = OGT | OLT, OLT implemented as OGT so check reversed operands
+;CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp one <4 x float> %A, %B
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmone2xdouble(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; ONE = OGT | OLT, OLT implemented as OGT so check reversed operands
+;CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp one <2 x double> %A, %B
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+
+define <2 x i32> @fcmord2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; ORD = OGE | OLT, OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+   %tmp3 = fcmp ord <2 x float> %A, %B
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+
+define <4 x i32> @fcmord4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; ORD = OGE | OLT, OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp ord <4 x float> %A, %B
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @fcmord2xdouble(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; ORD = OGE | OLT, OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp ord <2 x double> %A, %B
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+
+define <2 x i32> @fcmuno2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; UNO = !(OGE | OLT), OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+   %tmp3 = fcmp uno <2 x float> %A, %B
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmuno4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; UNO = !(OGE | OLT), OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp uno <4 x float> %A, %B
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @fcmuno2xdouble(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; UNO = !(OGE | OLT), OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp uno <2 x double> %A, %B
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmueq2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; UEQ = !ONE = !(OGT | OLT), OLT implemented as OGT so check reversed operands
+;CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+   %tmp3 = fcmp ueq <2 x float> %A, %B
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmueq4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; UEQ = !ONE = !(OGT | OLT), OLT implemented as OGT so check reversed operands
+;CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp ueq <4 x float> %A, %B
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @fcmueq2xdouble(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; UEQ = !ONE = !(OGT | OLT), OLT implemented as OGT so check reversed operands
+;CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp ueq <2 x double> %A, %B
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmuge2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; UGE = ULE with swapped operands, ULE implemented as !OGT.
+;CHECK: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+   %tmp3 = fcmp uge <2 x float> %A, %B
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmuge4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; UGE = ULE with swapped operands, ULE implemented as !OGT.
+;CHECK: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp uge <4 x float> %A, %B
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @fcmuge2xdouble(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; UGE = ULE with swapped operands, ULE implemented as !OGT.
+;CHECK: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp uge <2 x double> %A, %B
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmugt2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; UGT = ULT with swapped operands, ULT implemented as !OGE.
+;CHECK: fcmge {{v[0-9]+}}.2s, v1.2s, v0.2s
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+   %tmp3 = fcmp ugt <2 x float> %A, %B
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmugt4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; UGT = ULT with swapped operands, ULT implemented as !OGE.
+;CHECK: fcmge {{v[0-9]+}}.4s, v1.4s, v0.4s
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp ugt <4 x float> %A, %B
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmugt2xdouble(<2 x double> %A, <2 x double> %B) {
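+; Using registers other than v0 and v1 is possible, but would be odd.
+; UGT = ULT with swapped operands, ULT implemented as !OGE.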
+;CHECK: fcmge {{v[0-9]+}}.2d, v1.2d, v0.2d
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp ugt <2 x double> %A, %B
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmule2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; ULE implemented as !OGT.
+;CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+   %tmp3 = fcmp ule <2 x float> %A, %B
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmule4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; ULE implemented as !OGT.
+;CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp ule <4 x float> %A, %B
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmule2xdouble(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; ULE implemented as !OGT.
+;CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp ule <2 x double> %A, %B
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmult2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; ULT implemented as !OGE.
+;CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+   %tmp3 = fcmp ult <2 x float> %A, %B
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmult4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; ULT implemented as !OGE.
+;CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp ult <4 x float> %A, %B
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmult2xdouble(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; ULT implemented as !OGE.
+;CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp ult <2 x double> %A, %B
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmune2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; UNE = !OEQ.
+;CHECK: fcmeq {{v[0-9]+}}.2s, v0.2s, v1.2s
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+   %tmp3 = fcmp une <2 x float> %A, %B
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmune4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; UNE = !OEQ.
+;CHECK: fcmeq {{v[0-9]+}}.4s, v0.4s, v1.4s
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp une <4 x float> %A, %B
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmune2xdouble(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; UNE = !OEQ.
+;CHECK: fcmeq {{v[0-9]+}}.2d, v0.2d, v1.2d
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp une <2 x double> %A, %B
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
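+; Floating-point compares against zero use the #0.0 immediate forms.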
+define <2 x i32> @fcmoeqz2xfloat(<2 x float> %A) {
+;CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+   %tmp3 = fcmp oeq <2 x float> %A, zeroinitializer
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmoeqz4xfloat(<4 x float> %A) {
+;CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+   %tmp3 = fcmp oeq <4 x float> %A, zeroinitializer
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmoeqz2xdouble(<2 x double> %A) {
+;CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+   %tmp3 = fcmp oeq <2 x double> %A, zeroinitializer
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+
+define <2 x i32> @fcmogez2xfloat(<2 x float> %A) {
+;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+   %tmp3 = fcmp oge <2 x float> %A, zeroinitializer
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmogez4xfloat(<4 x float> %A) {
+;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+   %tmp3 = fcmp oge <4 x float> %A, zeroinitializer
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmogez2xdouble(<2 x double> %A) {
+;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+   %tmp3 = fcmp oge <2 x double> %A, zeroinitializer
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmogtz2xfloat(<2 x float> %A) {
+;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+   %tmp3 = fcmp ogt <2 x float> %A, zeroinitializer
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmogtz4xfloat(<4 x float> %A) {
+;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+   %tmp3 = fcmp ogt <4 x float> %A, zeroinitializer
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmogtz2xdouble(<2 x double> %A) {
+;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+   %tmp3 = fcmp ogt <2 x double> %A, zeroinitializer
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmoltz2xfloat(<2 x float> %A) {
+;CHECK: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+   %tmp3 = fcmp olt <2 x float> %A, zeroinitializer
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmoltz4xfloat(<4 x float> %A) {
+;CHECK: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+   %tmp3 = fcmp olt <4 x float> %A, zeroinitializer
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @fcmoltz2xdouble(<2 x double> %A) {
+;CHECK: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+   %tmp3 = fcmp olt <2 x double> %A, zeroinitializer
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmolez2xfloat(<2 x float> %A) {
+;CHECK: fcmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+   %tmp3 = fcmp ole <2 x float> %A, zeroinitializer
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmolez4xfloat(<4 x float> %A) {
+;CHECK: fcmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+   %tmp3 = fcmp ole <4 x float> %A, zeroinitializer
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @fcmolez2xdouble(<2 x double> %A) {
+;CHECK: fcmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+   %tmp3 = fcmp ole <2 x double> %A, zeroinitializer
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmonez2xfloat(<2 x float> %A) {
+; ONE with zero = OLT | OGT
+;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+   %tmp3 = fcmp one <2 x float> %A, zeroinitializer
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmonez4xfloat(<4 x float> %A) {
+; ONE with zero = OLT | OGT
+;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp one <4 x float> %A, zeroinitializer
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmonez2xdouble(<2 x double> %A) {
+; ONE with zero = OLT | OGT
+;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp one <2 x double> %A, zeroinitializer
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmordz2xfloat(<2 x float> %A) {
+; ORD with zero = OLT | OGE
+;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+   %tmp3 = fcmp ord <2 x float> %A, zeroinitializer
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmordz4xfloat(<4 x float> %A) {
+; ORD with zero = OLT | OGE
+;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp ord <4 x float> %A, zeroinitializer
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmordz2xdouble(<2 x double> %A) {
+; ORD with zero = OLT | OGE
+;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp ord <2 x double> %A, zeroinitializer
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmueqz2xfloat(<2 x float> %A) {
+; UEQ with zero = !ONE = !(OLT | OGT)
+;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+   %tmp3 = fcmp ueq <2 x float> %A, zeroinitializer
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmueqz4xfloat(<4 x float> %A) {
+; UEQ with zero = !ONE = !(OLT | OGT)
+;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp ueq <4 x float> %A, zeroinitializer
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @fcmueqz2xdouble(<2 x double> %A) {
+; UEQ with zero = !ONE = !(OLT | OGT)
+;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp ueq <2 x double> %A, zeroinitializer
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmugez2xfloat(<2 x float> %A) {
+; UGE with zero = !OLT
+;CHECK: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+   %tmp3 = fcmp uge <2 x float> %A, zeroinitializer
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmugez4xfloat(<4 x float> %A) {
+; UGE with zero = !OLT
+;CHECK: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp uge <4 x float> %A, zeroinitializer
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmugez2xdouble(<2 x double> %A) {
+; UGE with zero = !OLT
+;CHECK: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp uge <2 x double> %A, zeroinitializer
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmugtz2xfloat(<2 x float> %A) {
+; UGT with zero = !OLE
+;CHECK: fcmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+   %tmp3 = fcmp ugt <2 x float> %A, zeroinitializer
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmugtz4xfloat(<4 x float> %A) {
+; UGT with zero = !OLE
+;CHECK: fcmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp ugt <4 x float> %A, zeroinitializer
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmugtz2xdouble(<2 x double> %A) {
+; UGT with zero = !OLE
+;CHECK: fcmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp ugt <2 x double> %A, zeroinitializer
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmultz2xfloat(<2 x float> %A) {
+; ULT with zero = !OGE
+;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+   %tmp3 = fcmp ult <2 x float> %A, zeroinitializer
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmultz4xfloat(<4 x float> %A) {
+; ULT with zero = !OGE
+;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp ult <4 x float> %A, zeroinitializer
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @fcmultz2xdouble(<2 x double> %A) {
+; ULT with zero = !OGE
+;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp ult <2 x double> %A, zeroinitializer
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+
+define <2 x i32> @fcmulez2xfloat(<2 x float> %A) {
+; ULE with zero = !OGT
+;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+   %tmp3 = fcmp ule <2 x float> %A, zeroinitializer
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmulez4xfloat(<4 x float> %A) {
+; ULE with zero = !OGT
+;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp ule <4 x float> %A, zeroinitializer
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @fcmulez2xdouble(<2 x double> %A) {
+; ULE with zero = !OGT
+;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp ule <2 x double> %A, zeroinitializer
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmunez2xfloat(<2 x float> %A) {
+; UNE with zero = !OEQ with zero
+;CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+   %tmp3 = fcmp une <2 x float> %A, zeroinitializer
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmunez4xfloat(<4 x float> %A) {
+; UNE with zero = !OEQ with zero
+;CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp une <4 x float> %A, zeroinitializer
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmunez2xdouble(<2 x double> %A) {
+; UNE with zero = !OEQ with zero
+;CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp une <2 x double> %A, zeroinitializer
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
+
+
+define <2 x i32> @fcmunoz2xfloat(<2 x float> %A) {
+; UNO with zero = !ORD = !(OLT | OGE)
+;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+   %tmp3 = fcmp uno <2 x float> %A, zeroinitializer
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+	ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmunoz4xfloat(<4 x float> %A) {
+; UNO with zero = !ORD = !(OLT | OGE)
+;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp uno <4 x float> %A, zeroinitializer
+   %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+	ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @fcmunoz2xdouble(<2 x double> %A) {
+; UNO with zero = !ORD = !(OLT | OGE)
+;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+   %tmp3 = fcmp uno <2 x double> %A, zeroinitializer
+   %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+	ret <2 x i64> %tmp4
+}
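
All of the unordered predicates above lower through one recipe: AArch64 NEON
only provides ordered compares (fcmeq/fcmge/fcmgt and their #0.0 forms), so an
unordered compare is selected as the complement of the complementary ordered
compare, inverted by the movi #0xff / eor pair the checks expect. A rough
per-lane C model of the ULT case (a sketch for illustration only; the helper
names are ours, not part of the patch):

  #include <math.h>
  #include <stdint.h>

  /* NEON compares yield all-ones on true, zero on false; ordered compares
     are false whenever either input is NaN. */
  uint32_t fcmge_lane(float a, float b) {
      return (!isnan(a) && !isnan(b) && a >= b) ? 0xFFFFFFFFu : 0u;
  }

  /* fcmp ult == !oge: invert the ordered result, exactly what movi/eor do. */
  uint32_t fcmp_ult_lane(float a, float b) {
      return fcmge_lane(a, b) ^ 0xFFFFFFFFu;
  }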
diff --git a/llvm/test/CodeGen/AArch64/neon-facge-facgt.ll b/llvm/test/CodeGen/AArch64/neon-facge-facgt.ll
new file mode 100644
index 0000000..146256e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-facge-facgt.ll
@@ -0,0 +1,56 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+declare <2 x i32> @llvm.arm.neon.vacged(<2 x float>, <2 x float>)
+declare <4 x i32> @llvm.arm.neon.vacgeq(<4 x float>, <4 x float>)
+declare <2 x i64> @llvm.aarch64.neon.vacgeq(<2 x double>, <2 x double>)
+
+define <2 x i32> @facge_from_intr_v2i32(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
+; Using registers other than v0, v1, and v2 is possible, but would be odd.
+; CHECK: facge_from_intr_v2i32:
+  %val = call <2 x i32> @llvm.arm.neon.vacged(<2 x float> %A, <2 x float> %B)
+; CHECK: facge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  ret <2 x i32> %val
+}
+define <4 x i32> @facge_from_intr_v4i32(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0, v1, and v2 is possible, but would be odd.
+; CHECK: facge_from_intr_v4i32:
+  %val = call <4 x i32> @llvm.arm.neon.vacgeq(<4 x float> %A, <4 x float> %B)
+; CHECK: facge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+  ret <4 x i32> %val
+}
+
+define <2 x i64> @facge_from_intr_v2i64(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0, v1, and v2 is possible, but would be odd.
+; CHECK: facge_from_intr_v2i64:
+  %val = call <2 x i64> @llvm.aarch64.neon.vacgeq(<2 x double> %A, <2 x double> %B)
+; CHECK: facge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+  ret <2 x i64> %val
+}
+
+declare <2 x i32> @llvm.arm.neon.vacgtd(<2 x float>, <2 x float>)
+declare <4 x i32> @llvm.arm.neon.vacgtq(<4 x float>, <4 x float>)
+declare <2 x i64> @llvm.aarch64.neon.vacgtq(<2 x double>, <2 x double>)
+
+define <2 x i32> @facgt_from_intr_v2i32(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
+; Using registers other than v0, v1, and v2 is possible, but would be odd.
+; CHECK: facgt_from_intr_v2i32:
+  %val = call <2 x i32> @llvm.arm.neon.vacgtd(<2 x float> %A, <2 x float> %B)
+; CHECK: facgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+  ret <2 x i32> %val
+}
+define <4 x i32> @facgt_from_intr_v4i32(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0, v1, and v2 is possible, but would be odd.
+; CHECK: facgt_from_intr_v4i32:
+  %val = call <4 x i32> @llvm.arm.neon.vacgtq(<4 x float> %A, <4 x float> %B)
+; CHECK: facgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+  ret <4 x i32> %val
+}
+
+define <2 x i64> @facgt_from_intr_v2i64(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0, v1, and v2 is possible, but would be odd.
+; CHECK: facgt_from_intr_v2i64:
+  %val = call <2 x i64> @llvm.aarch64.neon.vacgtq(<2 x double> %A, <2 x double> %B)
+; CHECK: facgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+  ret <2 x i64> %val
+}
+
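
For reference, the absolute compares exercised here operate on magnitudes:
facge asserts |a| >= |b| per lane and facgt asserts |a| > |b|. A scalar C
model of the lane semantics (illustrative only; names are ours):

  #include <math.h>
  #include <stdint.h>

  /* facge lane: all-ones when |a| >= |b| (ordered), else zero. */
  uint32_t facge_lane(float a, float b) {
      return (fabsf(a) >= fabsf(b)) ? 0xFFFFFFFFu : 0u;
  }

  /* facgt lane: all-ones when |a| > |b| (ordered), else zero. */
  uint32_t facgt_lane(float a, float b) {
      return (fabsf(a) > fabsf(b)) ? 0xFFFFFFFFu : 0u;
  }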
diff --git a/llvm/test/CodeGen/AArch64/neon-fma.ll b/llvm/test/CodeGen/AArch64/neon-fma.ll
new file mode 100644
index 0000000..dcf4e28
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-fma.ll
@@ -0,0 +1,112 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+
+define <2 x float> @fmla2xfloat(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
+;CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+	%tmp1 = fmul <2 x float> %A, %B;
+	%tmp2 = fadd <2 x float> %C, %tmp1;
+	ret <2 x float> %tmp2
+}
+
+define <4 x float> @fmla4xfloat(<4 x float> %A, <4 x float> %B, <4 x float> %C) {
+;CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+	%tmp1 = fmul <4 x float> %A, %B;
+	%tmp2 = fadd <4 x float> %C, %tmp1;
+	ret <4 x float> %tmp2
+}
+
+define <2 x double> @fmla2xdouble(<2 x double> %A, <2 x double> %B, <2 x double> %C) {
+;CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+	%tmp1 = fmul <2 x double> %A, %B;
+	%tmp2 = fadd <2 x double> %C, %tmp1;
+	ret <2 x double> %tmp2
+}
+
+
+define <2 x float> @fmls2xfloat(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
+;CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+	%tmp1 = fmul <2 x float> %A, %B;
+	%tmp2 = fsub <2 x float> %C, %tmp1;
+	ret <2 x float> %tmp2
+}
+
+define <4 x float> @fmls4xfloat(<4 x float> %A, <4 x float> %B, <4 x float> %C) {
+;CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+	%tmp1 = fmul <4 x float> %A, %B;
+	%tmp2 = fsub <4 x float> %C, %tmp1;
+	ret <4 x float> %tmp2
+}
+
+define <2 x double> @fmls2xdouble(<2 x double> %A, <2 x double> %B, <2 x double> %C) {
+;CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+	%tmp1 = fmul <2 x double> %A, %B;
+	%tmp2 = fsub <2 x double> %C, %tmp1;
+	ret <2 x double> %tmp2
+}
+
+
+; Another set of tests for when the intrinsic is used.
+
+declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
+declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
+
+define <2 x float> @fmla2xfloat_fused(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
+;CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+        %val = call <2 x float> @llvm.fma.v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C)
+	ret <2 x float> %val
+}
+
+define <4 x float> @fmla4xfloat_fused(<4 x float> %A, <4 x float> %B, <4 x float> %C) {
+;CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+        %val = call <4 x float> @llvm.fma.v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C)
+	ret <4 x float> %val
+}
+
+define <2 x double> @fmla2xdouble_fused(<2 x double> %A, <2 x double> %B, <2 x double> %C) {
+;CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+        %val = call <2 x double> @llvm.fma.v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C)
+	ret <2 x double> %val
+}
+
+define <2 x float> @fmls2xfloat_fused(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
+;CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+        %negA = fsub <2 x float> <float -0.0, float -0.0>, %A
+        %val = call <2 x float> @llvm.fma.v2f32(<2 x float> %negA, <2 x float> %B, <2 x float> %C)
+	ret <2 x float> %val
+}
+
+define <4 x float> @fmls4xfloat_fused(<4 x float> %A, <4 x float> %B, <4 x float> %C) {
+;CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+        %negA = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %A
+        %val = call <4 x float> @llvm.fma.v4f32(<4 x float> %negA, <4 x float> %B, <4 x float> %C)
+	ret <4 x float> %val
+}
+
+define <2 x double> @fmls2xdouble_fused(<2 x double> %A, <2 x double> %B, <2 x double> %C) {
+;CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+        %negA = fsub <2 x double> <double -0.0, double -0.0>, %A
+        %val = call <2 x double> @llvm.fma.v2f64(<2 x double> %negA, <2 x double> %B, <2 x double> %C)
+	ret <2 x double> %val
+}
+
+declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>)
+declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>)
+declare <2 x double> @llvm.fmuladd.v2f64(<2 x double>, <2 x double>, <2 x double>)
+
+define <2 x float> @fmuladd2xfloat(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
+;CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+        %val = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C)
+	ret <2 x float> %val
+}
+
+define <4 x float> @fmuladd4xfloat_fused(<4 x float> %A, <4 x float> %B, <4 x float> %C) {
+;CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+        %val = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C)
+	ret <4 x float> %val
+}
+
+define <2 x double> @fmuladd2xdouble_fused(<2 x double> %A, <2 x double> %B, <2 x double> %C) {
+;CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+        %val = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C)
+	ret <2 x double> %val
+}
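
Two paths reach fmla/fmls here: the plain fmul+fadd/fsub pairs fuse only
because the RUN line passes -fp-contract=fast, while the llvm.fma and
llvm.fmuladd intrinsics may fuse unconditionally; fmls is formed by negating
the first multiplicand. A per-lane C model (a sketch; helper names are ours):

  #include <math.h>

  /* fmla acc, a, b == acc + a*b with a single rounding, i.e. a true FMA. */
  float fmla_lane(float acc, float a, float b) {
      return fmaf(a, b, acc);
  }

  /* fmls acc, a, b == acc - a*b, matching the fma(-a, b, acc) form that the
     fused tests construct explicitly. */
  float fmls_lane(float acc, float a, float b) {
      return fmaf(-a, b, acc);
  }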
diff --git a/llvm/test/CodeGen/AArch64/neon-frsqrt-frecp.ll b/llvm/test/CodeGen/AArch64/neon-frsqrt-frecp.ll
new file mode 100644
index 0000000..46fe25d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-frsqrt-frecp.ll
@@ -0,0 +1,54 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+; Set of tests for when the intrinsic is used.
+
+declare <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.arm.neon.vrsqrts.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @frsqrts_from_intr_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; CHECK: frsqrts v0.2s, v0.2s, v1.2s
+        %val = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+        ret <2 x float> %val
+}
+
+define <4 x float> @frsqrts_from_intr_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; CHECK: frsqrts v0.4s, v0.4s, v1.4s
+        %val = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+        ret <4 x float> %val
+}
+
+define <2 x double> @frsqrts_from_intr_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; CHECK: frsqrts v0.2d, v0.2d, v1.2d
+        %val = call <2 x double> @llvm.arm.neon.vrsqrts.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+        ret <2 x double> %val
+}
+
+declare <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.arm.neon.vrecps.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @frecps_from_intr_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; CHECK: frecps v0.2s, v0.2s, v1.2s
+        %val = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+        ret <2 x float> %val
+}
+
+define <4 x float> @frecps_from_intr_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; CHECK: frecps v0.4s, v0.4s, v1.4s
+        %val = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+        ret <4 x float> %val
+}
+
+define <2 x double> @frecps_from_intr_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; CHECK: frecps v0.2d, v0.2d, v1.2d
+        %val = call <2 x double> @llvm.arm.neon.vrecps.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+        ret <2 x double> %val
+}
+
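
frecps and frsqrts are the Newton-Raphson step instructions used to refine
reciprocal and reciprocal-square-root estimates. Ignoring the NaN/infinity
special cases, their per-lane arithmetic is (a sketch in C; names are ours):

  /* frecps(a, b)  = 2.0 - a*b; frsqrts(a, b) = (3.0 - a*b) / 2.0 */
  float frecps_lane(float a, float b)  { return 2.0f - a * b; }
  float frsqrts_lane(float a, float b) { return (3.0f - a * b) / 2.0f; }

  /* One refinement step for 1/x: est' = est * frecps(x, est). */
  float recip_step(float x, float est) { return est * frecps_lane(x, est); }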
diff --git a/llvm/test/CodeGen/AArch64/neon-halving-add-sub.ll b/llvm/test/CodeGen/AArch64/neon-halving-add-sub.ll
new file mode 100644
index 0000000..a8f59db
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-halving-add-sub.ll
@@ -0,0 +1,207 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_uhadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_uhadd_v8i8:
+  %tmp1 = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: uhadd v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @test_shadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_shadd_v8i8:
+  %tmp1 = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: shadd v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_uhadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_uhadd_v16i8:
+  %tmp1 = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: uhadd v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @test_shadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_shadd_v16i8:
+  %tmp1 = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: shadd v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_uhadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_uhadd_v4i16:
+  %tmp1 = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: uhadd v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @test_shadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_shadd_v4i16:
+  %tmp1 = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: shadd v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %tmp1
+}
+
+declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_uhadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_uhadd_v8i16:
+  %tmp1 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: uhadd v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @test_shadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_shadd_v8i16:
+  %tmp1 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: shadd v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %tmp1
+}
+
+declare <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_uhadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_uhadd_v2i32:
+  %tmp1 = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: uhadd v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @test_shadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_shadd_v2i32:
+  %tmp1 = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: shadd v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_uhadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_uhadd_v4i32:
+  %tmp1 = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: uhadd v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @test_shadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_shadd_v4i32:
+  %tmp1 = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: shadd v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %tmp1
+}
+
+
+declare <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_uhsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_uhsub_v8i8:
+  %tmp1 = call <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: uhsub v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @test_shsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_shsub_v8i8:
+  %tmp1 = call <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: shsub v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_uhsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_uhsub_v16i8:
+  %tmp1 = call <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: uhsub v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @test_shsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_shsub_v16i8:
+  %tmp1 = call <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: shsub v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_uhsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_uhsub_v4i16:
+  %tmp1 = call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: uhsub v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @test_shsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_shsub_v4i16:
+  %tmp1 = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: shsub v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %tmp1
+}
+
+declare <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_uhsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_uhsub_v8i16:
+  %tmp1 = call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: uhsub v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @test_shsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_shsub_v8i16:
+  %tmp1 = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: shsub v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %tmp1
+}
+
+declare <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_uhsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_uhsub_v2i32:
+  %tmp1 = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: uhsub v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @test_shsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_shsub_v2i32:
+  %tmp1 = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: shsub v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_uhsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_uhsub_v4i32:
+  %tmp1 = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: uhsub v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @test_shsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_shsub_v4i32:
+  %tmp1 = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: shsub v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %tmp1
+}
+
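
The halving operations compute (a +/- b) >> 1 in widened arithmetic, so the
intermediate sum or difference never overflows; these are the truncating
forms (the rounding variants, e.g. urhadd, add 1 before the shift and are not
covered by this file). A per-lane C model for the 8-bit case (illustrative;
the signed case assumes an arithmetic right shift):

  #include <stdint.h>

  uint8_t uhadd_lane(uint8_t a, uint8_t b) {
      return (uint8_t)(((unsigned)a + (unsigned)b) >> 1);
  }

  int8_t shadd_lane(int8_t a, int8_t b) {
      return (int8_t)(((int)a + (int)b) >> 1);  /* arithmetic shift assumed */
  }

  uint8_t uhsub_lane(uint8_t a, uint8_t b) {
      return (uint8_t)(((unsigned)a - (unsigned)b) >> 1);
  }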
diff --git a/llvm/test/CodeGen/AArch64/neon-max-min-pairwise.ll b/llvm/test/CodeGen/AArch64/neon-max-min-pairwise.ll
new file mode 100644
index 0000000..d757aca
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-max-min-pairwise.ll
@@ -0,0 +1,310 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+declare <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_smaxp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; CHECK: test_smaxp_v8i8:
+  %tmp1 = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: smaxp v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @test_umaxp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_umaxp_v8i8:
+  %tmp1 = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: umaxp v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vpmaxs.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.vpmaxu.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_smaxp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_smaxp_v16i8:
+  %tmp1 = call <16 x i8> @llvm.arm.neon.vpmaxs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: smaxp v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @test_umaxp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_umaxp_v16i8:
+  %tmp1 = call <16 x i8> @llvm.arm.neon.vpmaxu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: umaxp v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_smaxp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_smaxp_v4i16:
+  %tmp1 = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: smaxp v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @test_umaxp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_umaxp_v4i16:
+  %tmp1 = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: umaxp v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %tmp1
+}
+
+
+declare <8 x i16> @llvm.arm.neon.vpmaxs.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.arm.neon.vpmaxu.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_smaxp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_smaxp_v8i16:
+  %tmp1 = call <8 x i16> @llvm.arm.neon.vpmaxs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: smaxp v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @test_umaxp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_umaxp_v8i16:
+  %tmp1 = call <8 x i16> @llvm.arm.neon.vpmaxu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: umaxp v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %tmp1
+}
+
+
+declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_smaxp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_smaxp_v2i32:
+  %tmp1 = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: smaxp v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @test_umaxp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_umaxp_v2i32:
+  %tmp1 = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: umaxp v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vpmaxs.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vpmaxu.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_smaxp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_smaxp_v4i32:
+  %tmp1 = call <4 x i32> @llvm.arm.neon.vpmaxs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: smaxp v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @test_umaxp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_umaxp_v4i32:
+  %tmp1 = call <4 x i32> @llvm.arm.neon.vpmaxu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: umaxp v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %tmp1
+}
+
+declare <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_sminp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; CHECK: test_sminp_v8i8:
+  %tmp1 = call <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: sminp v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @test_uminp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_uminp_v8i8:
+  %tmp1 = call <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: uminp v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vpmins.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.vpminu.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_sminp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_sminp_v16i8:
+  %tmp1 = call <16 x i8> @llvm.arm.neon.vpmins.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: sminp v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @test_uminp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_uminp_v16i8:
+  %tmp1 = call <16 x i8> @llvm.arm.neon.vpminu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: uminp v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_sminp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_sminp_v4i16:
+  %tmp1 = call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: sminp v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @test_uminp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_uminp_v4i16:
+  %tmp1 = call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: uminp v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %tmp1
+}
+
+
+declare <8 x i16> @llvm.arm.neon.vpmins.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.arm.neon.vpminu.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_sminp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_sminp_v8i16:
+  %tmp1 = call <8 x i16> @llvm.arm.neon.vpmins.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: sminp v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @test_uminp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_uminp_v8i16:
+  %tmp1 = call <8 x i16> @llvm.arm.neon.vpminu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: uminp v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %tmp1
+}
+
+
+declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_sminp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_sminp_v2i32:
+  %tmp1 = call <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: sminp v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @test_uminp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_uminp_v2i32:
+  %tmp1 = call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: uminp v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vpmins.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vpminu.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_sminp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_sminp_v4i32:
+  %tmp1 = call <4 x i32> @llvm.arm.neon.vpmins.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: sminp v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @test_uminp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_uminp_v4i32:
+  %tmp1 = call <4 x i32> @llvm.arm.neon.vpminu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: uminp v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %tmp1
+}
+
+declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.arm.neon.vpmaxs.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.arm.neon.vpmaxs.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @test_fmaxp_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; CHECK: test_fmaxp_v2f32:
+        %val = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+; CHECK: fmaxp v0.2s, v0.2s, v1.2s
+        ret <2 x float> %val
+}
+
+define <4 x float> @test_fmaxp_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; CHECK: test_fmaxp_v4f32:
+        %val = call <4 x float> @llvm.arm.neon.vpmaxs.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+; CHECK: fmaxp v0.4s, v0.4s, v1.4s
+        ret <4 x float> %val
+}
+
+define <2 x double> @test_fmaxp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; CHECK: test_fmaxp_v2f64:
+        %val = call <2 x double> @llvm.arm.neon.vpmaxs.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+; CHECK: fmaxp v0.2d, v0.2d, v1.2d
+        ret <2 x double> %val
+}
+
+declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.arm.neon.vpmins.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.arm.neon.vpmins.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @test_fminp_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; CHECK: test_fminp_v2f32:
+        %val = call <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+; CHECK: fminp v0.2s, v0.2s, v1.2s
+        ret <2 x float> %val
+}
+
+define <4 x float> @test_fminp_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; CHECK: test_fminp_v4f32:
+        %val = call <4 x float> @llvm.arm.neon.vpmins.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+; CHECK: fminp v0.4s, v0.4s, v1.4s
+        ret <4 x float> %val
+}
+
+define <2 x double> @test_fminp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; CHECK: test_fminp_v2f64:
+        %val = call <2 x double> @llvm.arm.neon.vpmins.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+; CHECK: fminp v0.2d, v0.2d, v1.2d
+        ret <2 x double> %val
+}
+
+declare <2 x float> @llvm.aarch64.neon.vpmaxnm.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.aarch64.neon.vpmaxnm.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.aarch64.neon.vpmaxnm.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @test_fmaxnmp_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; CHECK: test_fmaxnmp_v2f32:
+        %val = call <2 x float> @llvm.aarch64.neon.vpmaxnm.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+; CHECK: fmaxnmp v0.2s, v0.2s, v1.2s
+        ret <2 x float> %val
+}
+
+define <4 x float> @test_fmaxnmp_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; CHECK: test_fmaxnmp_v4f32:
+        %val = call <4 x float> @llvm.aarch64.neon.vpmaxnm.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+; CHECK: fmaxnmp v0.4s, v0.4s, v1.4s
+        ret <4 x float> %val
+}
+
+define <2 x double> @test_fmaxnmp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; CHECK: test_fmaxnmp_v2f64:
+        %val = call <2 x double> @llvm.aarch64.neon.vpmaxnm.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+; CHECK: fmaxnmp v0.2d, v0.2d, v1.2d
+        ret <2 x double> %val
+}
+
+declare <2 x float> @llvm.aarch64.neon.vpminnm.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.aarch64.neon.vpminnm.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.aarch64.neon.vpminnm.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @test_fminnmp_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; CHECK: test_fminnmp_v2f32:
+        %val = call <2 x float> @llvm.aarch64.neon.vpminnm.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+; CHECK: fminnmp v0.2s, v0.2s, v1.2s
+        ret <2 x float> %val
+}
+
+define <4 x float> @test_fminnmp_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; CHECK: test_fminnmp_v4f32:
+        %val = call <4 x float> @llvm.aarch64.neon.vpminnm.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+; CHECK: fminnmp v0.4s, v0.4s, v1.4s
+        ret <4 x float> %val
+}
+
+define <2 x double> @test_fminnmp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; CHECK: test_fminnmp_v2f64:
+        %val = call <2 x double> @llvm.aarch64.neon.vpminnm.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+; CHECK: fminnmp v0.2d, v0.2d, v1.2d
+        ret <2 x double> %val
+}
+
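
All the pairwise operations reduce adjacent element pairs of the two operands
concatenated, so the low half of the result reduces the first operand and the
high half the second. A C model for smaxp on 8 x i8 lanes (a sketch; names
are ours, not part of the patch):

  #include <stdint.h>

  void smaxp_v8i8(const int8_t lhs[8], const int8_t rhs[8], int8_t out[8]) {
      int8_t cat[16];
      for (int i = 0; i < 8; i++) { cat[i] = lhs[i]; cat[8 + i] = rhs[i]; }
      for (int i = 0; i < 8; i++)
          out[i] = cat[2 * i] > cat[2 * i + 1] ? cat[2 * i] : cat[2 * i + 1];
  }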
diff --git a/llvm/test/CodeGen/AArch64/neon-max-min.ll b/llvm/test/CodeGen/AArch64/neon-max-min.ll
new file mode 100644
index 0000000..7889c77
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-max-min.ll
@@ -0,0 +1,310 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+declare <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_smax_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; CHECK: test_smax_v8i8:
+  %tmp1 = call <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: smax v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @test_umax_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_umax_v8i8:
+  %tmp1 = call <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: umax v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_smax_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_smax_v16i8:
+  %tmp1 = call <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: smax v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @test_umax_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_umax_v16i8:
+  %tmp1 = call <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: umax v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_smax_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_smax_v4i16:
+  %tmp1 = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: smax v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @test_umax_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_umax_v4i16:
+  %tmp1 = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: umax v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %tmp1
+}
+
+
+declare <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_smax_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_smax_v8i16:
+  %tmp1 = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: smax v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @test_umax_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_umax_v8i16:
+  %tmp1 = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: umax v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %tmp1
+}
+
+
+declare <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_smax_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_smax_v2i32:
+  %tmp1 = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: smax v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @test_umax_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_umax_v2i32:
+  %tmp1 = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: umax v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_smax_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_smax_v4i32:
+  %tmp1 = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: smax v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @test_umax_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_umax_v4i32:
+  %tmp1 = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: umax v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %tmp1
+}
+
+declare <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_smin_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; CHECK: test_smin_v8i8:
+  %tmp1 = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: smin v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @test_umin_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_umin_v8i8:
+  %tmp1 = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: umin v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_smin_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_smin_v16i8:
+  %tmp1 = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: smin v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @test_umin_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_umin_v16i8:
+  %tmp1 = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: umin v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_smin_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_smin_v4i16:
+  %tmp1 = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: smin v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @test_umin_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_umin_v4i16:
+  %tmp1 = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: umin v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %tmp1
+}
+
+
+declare <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_smin_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_smin_v8i16:
+  %tmp1 = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: smin v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @test_umin_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_umin_v8i16:
+  %tmp1 = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: umin v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %tmp1
+}
+
+
+declare <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_smin_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_smin_v2i32:
+  %tmp1 = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: smin v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @test_umin_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_umin_v2i32:
+  %tmp1 = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: umin v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_smin_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_smin_v4i32:
+  %tmp1 = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: smin v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @test_umin_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_umin_v4i32:
+  %tmp1 = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: umin v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %tmp1
+}
+
+declare <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.arm.neon.vmaxs.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @test_fmax_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; CHECK: test_fmax_v2f32:
+        %val = call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+; CHECK: fmax v0.2s, v0.2s, v1.2s
+        ret <2 x float> %val
+}
+
+define <4 x float> @test_fmax_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; CHECK: test_fmax_v4f32:
+        %val = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+; CHECK: fmax v0.4s, v0.4s, v1.4s
+        ret <4 x float> %val
+}
+
+define <2 x double> @test_fmax_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; CHECK: test_fmax_v2f64:
+        %val = call <2 x double> @llvm.arm.neon.vmaxs.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+; CHECK: fmax v0.2d, v0.2d, v1.2d
+        ret <2 x double> %val
+}
+
+declare <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.arm.neon.vmins.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @test_fmin_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; CHECK: test_fmin_v2f32:
+        %val = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+; CHECK: fmin v0.2s, v0.2s, v1.2s
+        ret <2 x float> %val
+}
+
+define <4 x float> @test_fmin_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; CHECK: test_fmin_v4f32:
+        %val = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+; CHECK: fmin v0.4s, v0.4s, v1.4s
+        ret <4 x float> %val
+}
+
+define <2 x double> @test_fmin_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; CHECK: test_fmin_v2f64:
+        %val = call <2 x double> @llvm.arm.neon.vmins.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+; CHECK: fmin v0.2d, v0.2d, v1.2d
+        ret <2 x double> %val
+}
+
+
+declare <2 x float> @llvm.aarch64.neon.vmaxnm.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.aarch64.neon.vmaxnm.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.aarch64.neon.vmaxnm.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @test_fmaxnm_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; CHECK: test_fmaxnm_v2f32:
+        %val = call <2 x float> @llvm.aarch64.neon.vmaxnm.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+; CHECK: fmaxnm v0.2s, v0.2s, v1.2s
+        ret <2 x float> %val
+}
+
+define <4 x float> @test_fmaxnm_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; CHECK: test_fmaxnm_v4f32:
+        %val = call <4 x float> @llvm.aarch64.neon.vmaxnm.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+; CHECK: fmaxnm v0.4s, v0.4s, v1.4s
+        ret <4 x float> %val
+}
+
+define <2 x double> @test_fmaxnm_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; CHECK: test_fmaxnm_v2f64:
+        %val = call <2 x double> @llvm.aarch64.neon.vmaxnm.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+; CHECK: fmaxnm v0.2d, v0.2d, v1.2d
+        ret <2 x double> %val
+}
+
+declare <2 x float> @llvm.aarch64.neon.vminnm.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.aarch64.neon.vminnm.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.aarch64.neon.vminnm.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @test_fminnm_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; CHECK: test_fminnm_v2f32:
+        %val = call <2 x float> @llvm.aarch64.neon.vminnm.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+; CHECK: fminnm v0.2s, v0.2s, v1.2s
+        ret <2 x float> %val
+}
+
+define <4 x float> @test_fminnm_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; CHECK: test_fminnm_v4f32:
+        %val = call <4 x float> @llvm.aarch64.neon.vminnm.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+; CHECK: fminnm v0.4s, v0.4s, v1.4s
+        ret <4 x float> %val
+}
+
+define <2 x double> @test_fminnm_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; CHECK: test_fminnm_v2f64:
+        %val = call <2 x double> @llvm.aarch64.neon.vminnm.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+; CHECK: fminnm v0.2d, v0.2d, v1.2d
+        ret <2 x double> %val
+}
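
The nm variants differ from fmax/fmin only in NaN handling: fmax propagates a
NaN from either input, while fmaxnm implements IEEE 754-2008 maxNum and
prefers the numeric operand over a quiet NaN. A scalar C model (signalling
NaNs aside; helper names are ours):

  #include <math.h>

  /* fmax lane: NaN-propagating maximum. */
  float fmax_lane(float a, float b) {
      if (isnan(a) || isnan(b)) return NAN;
      return a > b ? a : b;
  }

  /* fmaxnm lane: maxNum semantics; C's fmaxf already behaves this way. */
  float fmaxnm_lane(float a, float b) { return fmaxf(a, b); }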
diff --git a/llvm/test/CodeGen/AArch64/neon-mla-mls.ll b/llvm/test/CodeGen/AArch64/neon-mla-mls.ll
new file mode 100644
index 0000000..23e9223
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-mla-mls.ll
@@ -0,0 +1,88 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+
+define <8 x i8> @mla8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) {
+;CHECK: mla {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+	%tmp1 = mul <8 x i8> %A, %B
+	%tmp2 = add <8 x i8> %C, %tmp1
+	ret <8 x i8> %tmp2
+}
+
+define <16 x i8> @mla16xi8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) {
+;CHECK: mla {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp1 = mul <16 x i8> %A, %B
+	%tmp2 = add <16 x i8> %C, %tmp1
+	ret <16 x i8> %tmp2
+}
+
+define <4 x i16> @mla4xi16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) {
+;CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+	%tmp1 = mul <4 x i16> %A, %B
+	%tmp2 = add <4 x i16> %C, %tmp1
+	ret <4 x i16> %tmp2
+}
+
+define <8 x i16> @mla8xi16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) {
+;CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+	%tmp1 = mul <8 x i16> %A, %B
+	%tmp2 = add <8 x i16> %C, %tmp1
+	ret <8 x i16> %tmp2
+}
+
+define <2 x i32> @mla2xi32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) {
+;CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+	%tmp1 = mul <2 x i32> %A, %B
+	%tmp2 = add <2 x i32> %C, %tmp1
+	ret <2 x i32> %tmp2
+}
+
+define <4 x i32> @mla4xi32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+;CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+	%tmp1 = mul <4 x i32> %A, %B
+	%tmp2 = add <4 x i32> %C, %tmp1
+	ret <4 x i32> %tmp2
+}
+
+define <8 x i8> @mls8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) {
+;CHECK: mls {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+	%tmp1 = mul <8 x i8> %A, %B
+	%tmp2 = sub <8 x i8> %C, %tmp1
+	ret <8 x i8> %tmp2
+}
+
+define <16 x i8> @mls16xi8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) {
+;CHECK: mls {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp1 = mul <16 x i8> %A, %B
+	%tmp2 = sub <16 x i8> %C, %tmp1
+	ret <16 x i8> %tmp2
+}
+
+define <4 x i16> @mls4xi16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) {
+;CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+	%tmp1 = mul <4 x i16> %A, %B
+	%tmp2 = sub <4 x i16> %C, %tmp1
+	ret <4 x i16> %tmp2
+}
+
+define <8 x i16> @mls8xi16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) {
+;CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+	%tmp1 = mul <8 x i16> %A, %B
+	%tmp2 = sub <8 x i16> %C, %tmp1
+	ret <8 x i16> %tmp2
+}
+
+define <2 x i32> @mls2xi32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) {
+;CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+	%tmp1 = mul <2 x i32> %A, %B
+	%tmp2 = sub <2 x i32> %C, %tmp1
+	ret <2 x i32> %tmp2
+}
+
+define <4 x i32> @mls4xi32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+;CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+	%tmp1 = mul <4 x i32> %A, %B
+	%tmp2 = sub <4 x i32> %C, %tmp1
+	ret <4 x i32> %tmp2
+}
+
+
diff --git a/llvm/test/CodeGen/AArch64/neon-mov.ll b/llvm/test/CodeGen/AArch64/neon-mov.ll
new file mode 100644
index 0000000..42f6a89
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-mov.ll
@@ -0,0 +1,207 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define <8 x i8> @movi8b() {
+;CHECK:  movi {{v[0-9]+}}.8b, #0x8
+   ret <8 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
+}
+
+define <16 x i8> @movi16b() {
+;CHECK:  movi {{v[0-9]+}}.16b, #0x8
+   ret <16 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
+}
+
+define <2 x i32> @movi2s_lsl0() {
+;CHECK:  movi {{v[0-9]+}}.2s, #0xff
+   ret <2 x i32> < i32 255, i32 255 >
+}
+
+define <2 x i32> @movi2s_lsl8() {
+;CHECK:  movi {{v[0-9]+}}.2s, #0xff, lsl #8
+   ret <2 x i32> < i32 65280, i32 65280 >
+}
+
+define <2 x i32> @movi2s_lsl16() {
+;CHECK:  movi {{v[0-9]+}}.2s, #0xff, lsl #16
+   ret <2 x i32> < i32 16711680, i32 16711680 >
+}
+
+define <2 x i32> @movi2s_lsl24() {
+;CHECK:  movi {{v[0-9]+}}.2s, #0xff, lsl #24
+   ret <2 x i32> < i32 4278190080, i32 4278190080 >
+}
+
+define <4 x i32> @movi4s_lsl0() {
+;CHECK:  movi {{v[0-9]+}}.4s, #0xff
+   ret <4 x i32> < i32 255, i32 255, i32 255, i32 255 >
+}
+
+define <4 x i32> @movi4s_lsl8() {
+;CHECK:  movi {{v[0-9]+}}.4s, #0xff, lsl #8
+   ret <4 x i32> < i32 65280, i32 65280, i32 65280, i32 65280 >
+}
+
+define <4 x i32> @movi4s_lsl16() {
+;CHECK:  movi {{v[0-9]+}}.4s, #0xff, lsl #16
+   ret <4 x i32> < i32 16711680, i32 16711680, i32 16711680, i32 16711680 >
+}
+
+define <4 x i32> @movi4s_lsl24() {
+;CHECK:  movi {{v[0-9]+}}.4s, #0xff, lsl #24
+   ret <4 x i32> < i32 4278190080, i32 4278190080, i32 4278190080, i32 4278190080 >
+}
+
+define <4 x i16> @movi4h_lsl0() {
+;CHECK:  movi {{v[0-9]+}}.4h, #0xff
+   ret <4 x i16> < i16 255, i16 255, i16 255, i16 255 >
+}
+
+define <4 x i16> @movi4h_lsl8() {
+;CHECK:  movi {{v[0-9]+}}.4h, #0xff, lsl #8
+   ret <4 x i16> < i16 65280, i16 65280, i16 65280, i16 65280 >
+}
+
+define <8 x i16> @movi8h_lsl0() {
+;CHECK:  movi {{v[0-9]+}}.8h, #0xff
+   ret <8 x i16> < i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255 >
+}
+
+define <8 x i16> @movi8h_lsl8() {
+;CHECK:  movi {{v[0-9]+}}.8h, #0xff, lsl #8
+   ret <8 x i16> < i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280 >
+}
+
+
+define <2 x i32> @mvni2s_lsl0() {
+;CHECK:  mvni {{v[0-9]+}}.2s, #0x10
+   ret <2 x i32> < i32 4294967279, i32 4294967279 >
+}
+
+define <2 x i32> @mvni2s_lsl8() {
+;CHECK:  mvni {{v[0-9]+}}.2s, #0x10, lsl #8
+   ret <2 x i32> < i32 4294963199, i32 4294963199 >
+}
+
+define <2 x i32> @mvni2s_lsl16() {
+;CHECK:  mvni {{v[0-9]+}}.2s, #0x10, lsl #16
+   ret <2 x i32> < i32 4293918719, i32 4293918719 >
+}
+
+define <2 x i32> @mvni2s_lsl24() {
+;CHECK:  mvni {{v[0-9]+}}.2s, #0x10, lsl #24
+   ret <2 x i32> < i32 4026531839, i32 4026531839 >
+}
+
+define <4 x i32> @mvni4s_lsl0() {
+;CHECK:  mvni {{v[0-9]+}}.4s, #0x10
+   ret <4 x i32> < i32 4294967279, i32 4294967279, i32 4294967279, i32 4294967279 >
+}
+
+define <4 x i32> @mvni4s_lsl8() {
+;CHECK:  mvni {{v[0-9]+}}.4s, #0x10, lsl #8
+   ret <4 x i32> < i32 4294963199, i32 4294963199, i32 4294963199, i32 4294963199 >
+}
+
+define <4 x i32> @mvni4s_lsl16() {
+;CHECK:  mvni {{v[0-9]+}}.4s, #0x10, lsl #16
+   ret <4 x i32> < i32 4293918719, i32 4293918719, i32 4293918719, i32 4293918719 >
+}
+
+define <4 x i32> @mvni4s_lsl24() {
+;CHECK:  mvni {{v[0-9]+}}.4s, #0x10, lsl #24
+   ret <4 x i32> < i32 4026531839, i32 4026531839, i32 4026531839, i32 4026531839 >
+}
+
+
+define <4 x i16> @mvni4h_lsl0() {
+;CHECK:  mvni {{v[0-9]+}}.4h, #0x10
+   ret <4 x i16> < i16 65519, i16 65519, i16 65519, i16 65519 >
+}
+
+define <4 x i16> @mvni4h_lsl8() {
+;CHECK:  mvni {{v[0-9]+}}.4h, #0x10, lsl #8
+   ret <4 x i16> < i16 61439, i16 61439, i16 61439, i16 61439 >
+}
+
+define <8 x i16> @mvni8h_lsl0() {
+;CHECK:  mvni {{v[0-9]+}}.8h, #0x10
+   ret <8 x i16> < i16 65519, i16 65519, i16 65519, i16 65519, i16 65519, i16 65519, i16 65519, i16 65519 >
+}
+
+define <8 x i16> @mvni8h_lsl8() {
+;CHECK:  mvni {{v[0-9]+}}.8h, #0x10, lsl #8
+   ret <8 x i16> < i16 61439, i16 61439, i16 61439, i16 61439, i16 61439, i16 61439, i16 61439, i16 61439 >
+}
+
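+; MSL ("modified shift left") shifts ones, not zeroes, into the low bits:
+; the encoded value is (imm8 << shift) | ((1 << shift) - 1), so for example
+; movi #0xff, msl #8 places 0x0000ffff (65535) in each 32-bit lane.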
+
+define <2 x i32> @movi2s_msl8() {
+;CHECK:  movi {{v[0-9]+}}.2s, #0xff, msl #8
+   ret <2 x i32> < i32 65535, i32 65535 >
+}
+
+define <2 x i32> @movi2s_msl16() {
+;CHECK:  movi {{v[0-9]+}}.2s, #0xff, msl #16
+   ret <2 x i32> < i32 16777215, i32 16777215 >
+}
+
+
+define <4 x i32> @movi4s_msl8() {
+;CHECK:  movi {{v[0-9]+}}.4s, #0xff, msl #8
+   ret <4 x i32> < i32 65535, i32 65535, i32 65535, i32 65535 >
+}
+
+define <4 x i32> @movi4s_msl16() {
+;CHECK:  movi {{v[0-9]+}}.4s, #0xff, msl #16
+   ret <4 x i32> < i32 16777215, i32 16777215, i32 16777215, i32 16777215 >
+}
+
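+; MVNI is the bitwise NOT of the corresponding MOVI immediate, so for example
+; mvni #0x10, msl #8 places ~0x000010ff = 0xffffef00 (4294962944) in each lane.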
+define <2 x i32> @mvni2s_msl8() {
+;CHECK:  mvni {{v[0-9]+}}.2s, #0x10, msl #8
+   ret <2 x i32> < i32 4294962944, i32 4294962944 >
+}
+
+define <2 x i32> @mvni2s_msl16() {
+;CHECK:  mvni {{v[0-9]+}}.2s, #0x10, msl #16
+   ret <2 x i32> < i32 4293656576, i32 4293656576 >
+}
+
+define <4 x i32> @mvni4s_msl8() {
+;CHECK:  mvni {{v[0-9]+}}.4s, #0x10, msl #8
+   ret <4 x i32> < i32 4294962944, i32 4294962944, i32 4294962944, i32 4294962944 >
+}
+
+define <4 x i32> @mvni4s_msl16() {
+;CHECK:  mvni {{v[0-9]+}}.4s, #0x10, msl #16
+   ret <4 x i32> < i32 4293656576, i32 4293656576, i32 4293656576, i32 4293656576 >
+}
+
+define <2 x i64> @movi2d() {
+;CHECK: movi {{v[0-9]+}}.2d, #0xff0000ff0000ffff
+   ret <2 x i64> < i64 18374687574888349695, i64 18374687574888349695 >
+}
+
+define <1 x i64> @movid() {
+;CHECK: movi {{d[0-9]+}}, #0xff0000ff0000ffff
+   ret <1 x i64> < i64 18374687574888349695 >
+}
+
+define <2 x float> @fmov2s() {
+;CHECK:  fmov {{v[0-9]+}}.2s, #-12.00000000
+   ret <2 x float> < float -1.2e1, float -1.2e1 >
+}
+
+define <4 x float> @fmov4s() {
+;CHECK:  fmov {{v[0-9]+}}.4s, #-12.00000000
+   ret <4 x float> < float -1.2e1, float -1.2e1, float -1.2e1, float -1.2e1 >
+}
+
+define <2 x double> @fmov2d() {
+;CHECK:  fmov {{v[0-9]+}}.2d, #-12.00000000
+   ret <2 x double> < double -1.2e1, double -1.2e1 >
+}
+
+
diff --git a/llvm/test/CodeGen/AArch64/neon-mul-div.ll b/llvm/test/CodeGen/AArch64/neon-mul-div.ll
new file mode 100644
index 0000000..e1be313
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-mul-div.ll
@@ -0,0 +1,188 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+
+define <8 x i8> @mul8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: mul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+	%tmp3 = mul <8 x i8> %A, %B
+	ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @mul16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: mul {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp3 = mul <16 x i8> %A, %B
+	ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @mul4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+	%tmp3 = mul <4 x i16> %A, %B
+	ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @mul8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+	%tmp3 = mul <8 x i16> %A, %B
+	ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @mul2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+	%tmp3 = mul <2 x i32> %A, %B
+	ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @mul4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+	%tmp3 = mul <4 x i32> %A, %B
+	ret <4 x i32> %tmp3
+}
+
+define <2 x float> @mul2xfloat(<2 x float> %A, <2 x float> %B) {
+;CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+	%tmp3 = fmul <2 x float> %A, %B
+	ret <2 x float> %tmp3
+}
+
+define <4 x float> @mul4xfloat(<4 x float> %A, <4 x float> %B) {
+;CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+	%tmp3 = fmul <4 x float> %A, %B
+	ret <4 x float> %tmp3
+}
+define <2 x double> @mul2xdouble(<2 x double> %A, <2 x double> %B) {
+;CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+	%tmp3 = fmul <2 x double> %A, %B
+	ret <2 x double> %tmp3
+}
+
+
+define <2 x float> @div2xfloat(<2 x float> %A, <2 x float> %B) {
+;CHECK: fdiv {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+	%tmp3 = fdiv <2 x float> %A, %B
+	ret <2 x float> %tmp3
+}
+
+define <4 x float> @div4xfloat(<4 x float> %A, <4 x float> %B) {
+;CHECK: fdiv {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+	%tmp3 = fdiv <4 x float> %A, %B
+	ret <4 x float> %tmp3
+}
+define <2 x double> @div2xdouble(<2 x double> %A, <2 x double> %B) {
+;CHECK: fdiv {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+	%tmp3 = fdiv <2 x double> %A, %B
+	ret <2 x double> %tmp3
+}
+
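+; PMUL is a polynomial (carry-less) multiply over GF(2): partial products
+; are combined with XOR rather than addition, so no carries propagate.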
+declare <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8>, <8 x i8>)
+declare <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8>, <16 x i8>)
+
+define <8 x i8> @poly_mulv8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: poly_mulv8i8:
+   %prod = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: pmul v0.8b, v0.8b, v1.8b
+   ret <8 x i8> %prod
+}
+
+define <16 x i8> @poly_mulv16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: poly_mulv16i8:
+   %prod = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: pmul v0.16b, v0.16b, v1.16b
+   ret <16 x i8> %prod
+}
+
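+; SQDMULH returns the high half of the doubling product, with saturation:
+;   saturate((2 * a * b) >> lane_bits)
+; e.g. for i16 lanes, 0x4000 * 0x4000 gives (0x20000000 >> 16) = 0x2000.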
+declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i16> @test_sqdmulh_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_sqdmulh_v4i16:
+   %prod = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: sqdmulh v0.4h, v0.4h, v1.4h
+   ret <4 x i16> %prod
+}
+
+define <8 x i16> @test_sqdmulh_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_sqdmulh_v8i16:
+   %prod = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: sqdmulh v0.8h, v0.8h, v1.8h
+   ret <8 x i16> %prod
+}
+
+define <2 x i32> @test_sqdmulh_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_sqdmulh_v2i32:
+   %prod = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: sqdmulh v0.2s, v0.2s, v1.2s
+   ret <2 x i32> %prod
+}
+
+define <4 x i32> @test_sqdmulh_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_sqdmulh_v4i32:
+   %prod = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: sqdmulh v0.4s, v0.4s, v1.4s
+   ret <4 x i32> %prod
+}
+
+declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i16> @test_sqrdmulh_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_sqrdmulh_v4i16:
+   %prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: sqrdmulh v0.4h, v0.4h, v1.4h
+   ret <4 x i16> %prod
+}
+
+define <8 x i16> @test_sqrdmulh_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_sqrdmulh_v8i16:
+   %prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: sqrdmulh v0.8h, v0.8h, v1.8h
+   ret <8 x i16> %prod
+}
+
+define <2 x i32> @test_sqrdmulh_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_sqrdmulh_v2i32:
+   %prod = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: sqrdmulh v0.2s, v0.2s, v1.2s
+   ret <2 x i32> %prod
+}
+
+define <4 x i32> @test_sqrdmulh_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_sqrdmulh_v4i32:
+   %prod = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: sqrdmulh v0.4s, v0.4s, v1.4s
+   ret <4 x i32> %prod
+}
+
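+; FMULX behaves like fmul except that (+/-)0.0 * (+/-)Inf returns (+/-)2.0
+; (sign taken from the XOR of the operand signs) instead of NaN.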
+declare <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @fmulx_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; CHECK: fmulx v0.2s, v0.2s, v1.2s
+        %val = call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+        ret <2 x float> %val
+}
+
+define <4 x float> @fmulx_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; CHECK: fmulx v0.4s, v0.4s, v1.4s
+        %val = call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+        ret <4 x float> %val
+}
+
+define <2 x double> @fmulx_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; CHECK: fmulx v0.2d, v0.2d, v1.2d
+        %val = call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+        ret <2 x double> %val
+}
diff --git a/llvm/test/CodeGen/AArch64/neon-rounding-halving-add.ll b/llvm/test/CodeGen/AArch64/neon-rounding-halving-add.ll
new file mode 100644
index 0000000..009da3b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-rounding-halving-add.ll
@@ -0,0 +1,107 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
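+; [US]RHADD is a rounding halving add, (a + b + 1) >> 1, computed without
+; intermediate overflow: e.g. urhadd(250, 7) on i8 lanes is 129, not wrapped.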
+declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_urhadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_urhadd_v8i8:
+  %tmp1 = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: urhadd v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @test_srhadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_srhadd_v8i8:
+  %tmp1 = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: srhadd v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_urhadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_urhadd_v16i8:
+  %tmp1 = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: urhadd v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @test_srhadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_srhadd_v16i8:
+  %tmp1 = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: srhadd v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_urhadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_urhadd_v4i16:
+  %tmp1 = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: urhadd v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @test_srhadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_srhadd_v4i16:
+  %tmp1 = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: srhadd v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %tmp1
+}
+
+declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_urhadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_urhadd_v8i16:
+  %tmp1 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: urhadd v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @test_srhadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_srhadd_v8i16:
+  %tmp1 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: srhadd v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %tmp1
+}
+
+declare <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_urhadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_urhadd_v2i32:
+  %tmp1 = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: urhadd v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @test_srhadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_srhadd_v2i32:
+  %tmp1 = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: srhadd v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_urhadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_urhadd_v4i32:
+  %tmp1 = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: urhadd v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @test_srhadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_srhadd_v4i32:
+  %tmp1 = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: srhadd v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %tmp1
+}
+
+
diff --git a/llvm/test/CodeGen/AArch64/neon-rounding-shift.ll b/llvm/test/CodeGen/AArch64/neon-rounding-shift.ll
new file mode 100644
index 0000000..404e491
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-rounding-shift.ll
@@ -0,0 +1,140 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
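+; [US]RSHL shifts each lane left by the signed low byte of the matching rhs
+; lane; a negative amount is a rounding right shift, e.g. srshl(5, -1) = 3.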
+declare <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_urshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_urshl_v8i8:
+  %tmp1 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: urshl v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @test_srshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_srshl_v8i8:
+  %tmp1 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: srshl v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_urshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_urshl_v16i8:
+  %tmp1 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: urshl v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @test_srshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_srshl_v16i8:
+  %tmp1 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: srshl v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_urshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_urshl_v4i16:
+  %tmp1 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: urshl v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @test_srshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_srshl_v4i16:
+  %tmp1 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: srshl v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %tmp1
+}
+
+declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_urshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_urshl_v8i16:
+  %tmp1 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: urshl v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @test_srshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_srshl_v8i16:
+  %tmp1 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: srshl v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %tmp1
+}
+
+declare <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_urshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_urshl_v2i32:
+  %tmp1 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: urshl v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @test_srshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_srshl_v2i32:
+  %tmp1 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: srshl v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_urshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_urshl_v4i32:
+  %tmp1 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: urshl v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @test_srshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_srshl_v4i32:
+  %tmp1 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: srshl v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %tmp1
+}
+
+declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_urshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_urshl_v1i64:
+  %tmp1 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: urshl d0, d0, d1
+  ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_srshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_srshl_v1i64:
+  %tmp1 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: srshl d0, d0, d1
+  ret <1 x i64> %tmp1
+}
+
+declare <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64>, <2 x i64>)
+
+define <2 x i64> @test_urshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_urshl_v2i64:
+  %tmp1 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: urshl v0.2d, v0.2d, v1.2d
+  ret <2 x i64> %tmp1
+}
+
+define <2 x i64> @test_srshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_srshl_v2i64:
+  %tmp1 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: srshl v0.2d, v0.2d, v1.2d
+  ret <2 x i64> %tmp1
+}
+
diff --git a/llvm/test/CodeGen/AArch64/neon-saturating-add-sub.ll b/llvm/test/CodeGen/AArch64/neon-saturating-add-sub.ll
new file mode 100644
index 0000000..b2fac1f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-saturating-add-sub.ll
@@ -0,0 +1,276 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
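+; UQADD/UQSUB clamp results to [0, 2^n - 1] and SQADD/SQSUB to
+; [-2^(n-1), 2^(n-1) - 1]: e.g. uqadd(200, 100) on i8 lanes yields 255.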
+declare <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_uqadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_uqadd_v8i8:
+  %tmp1 = call <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: uqadd v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @test_sqadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_sqadd_v8i8:
+  %tmp1 = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: sqadd v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_uqadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_uqadd_v16i8:
+  %tmp1 = call <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: uqadd v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @test_sqadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_sqadd_v16i8:
+  %tmp1 = call <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: sqadd v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_uqadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_uqadd_v4i16:
+  %tmp1 = call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: uqadd v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @test_sqadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_sqadd_v4i16:
+  %tmp1 = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: sqadd v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %tmp1
+}
+
+declare <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_uqadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_uqadd_v8i16:
+  %tmp1 = call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: uqadd v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @test_sqadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_sqadd_v8i16:
+  %tmp1 = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: sqadd v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %tmp1
+}
+
+declare <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_uqadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_uqadd_v2i32:
+  %tmp1 = call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: uqadd v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @test_sqadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_sqadd_v2i32:
+  %tmp1 = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: sqadd v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_uqadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_uqadd_v4i32:
+  %tmp1 = call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: uqadd v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @test_sqadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_sqadd_v4i32:
+  %tmp1 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: sqadd v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %tmp1
+}
+
+declare <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_uqadd_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_uqadd_v1i64:
+  %tmp1 = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: uqadd d0, d0, d1
+  ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_sqadd_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_sqadd_v1i64:
+  %tmp1 = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: sqadd d0, d0, d1
+  ret <1 x i64> %tmp1
+}
+
+declare <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>)
+
+define <2 x i64> @test_uqadd_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_uqadd_v2i64:
+  %tmp1 = call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: uqadd v0.2d, v0.2d, v1.2d
+  ret <2 x i64> %tmp1
+}
+
+define <2 x i64> @test_sqadd_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_sqadd_v2i64:
+  %tmp1 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: sqadd v0.2d, v0.2d, v1.2d
+  ret <2 x i64> %tmp1
+}
+
+declare <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_uqsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_uqsub_v8i8:
+  %tmp1 = call <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: uqsub v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @test_sqsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_sqsub_v8i8:
+  %tmp1 = call <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: sqsub v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_uqsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_uqsub_v16i8:
+  %tmp1 = call <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: uqsub v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @test_sqsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_sqsub_v16i8:
+  %tmp1 = call <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: sqsub v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_uqsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_uqsub_v4i16:
+  %tmp1 = call <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: uqsub v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @test_sqsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_sqsub_v4i16:
+  %tmp1 = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: sqsub v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %tmp1
+}
+
+declare <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_uqsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_uqsub_v8i16:
+  %tmp1 = call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: uqsub v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @test_sqsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_sqsub_v8i16:
+  %tmp1 = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: sqsub v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %tmp1
+}
+
+declare <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_uqsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_uqsub_v2i32:
+  %tmp1 = call <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: uqsub v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @test_sqsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_sqsub_v2i32:
+  %tmp1 = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: sqsub v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_uqsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_uqsub_v4i32:
+  %tmp1 = call <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: uqsub v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @test_sqsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_sqsub_v4i32:
+  %tmp1 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: sqsub v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %tmp1
+}
+
+declare <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>)
+
+define <2 x i64> @test_uqsub_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_uqsub_v2i64:
+  %tmp1 = call <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: uqsub v0.2d, v0.2d, v1.2d
+  ret <2 x i64> %tmp1
+}
+
+define <2 x i64> @test_sqsub_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_sqsub_v2i64:
+  %tmp1 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: sqsub v0.2d, v0.2d, v1.2d
+  ret <2 x i64> %tmp1
+}
+
+declare <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_uqsub_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_uqsub_v1i64:
+  %tmp1 = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: uqsub d0, d0, d1
+  ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_sqsub_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_sqsub_v1i64:
+  %tmp1 = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: sqsub d0, d0, d1
+  ret <1 x i64> %tmp1
+}
+
diff --git a/llvm/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll b/llvm/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll
new file mode 100644
index 0000000..05d8dfe
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll
@@ -0,0 +1,140 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
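+; [US]QRSHL combines both behaviours: the rounding of [US]RSHL on right
+; shifts and the saturation of [US]QSHL when a left shift overflows the lane.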
+declare <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_uqrshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_uqrshl_v8i8:
+  %tmp1 = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: uqrshl v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @test_sqrshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_sqrshl_v8i8:
+  %tmp1 = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: sqrshl v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_uqrshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_uqrshl_v16i8:
+  %tmp1 = call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: uqrshl v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @test_sqrshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_sqrshl_v16i8:
+  %tmp1 = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: sqrshl v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_uqrshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_uqrshl_v4i16:
+  %tmp1 = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: uqrshl v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @test_sqrshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_sqrshl_v4i16:
+  %tmp1 = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: sqrshl v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %tmp1
+}
+
+declare <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_uqrshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_uqrshl_v8i16:
+  %tmp1 = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: uqrshl v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @test_sqrshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_sqrshl_v8i16:
+  %tmp1 = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: sqrshl v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %tmp1
+}
+
+declare <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_uqrshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_uqrshl_v2i32:
+  %tmp1 = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: uqrshl v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @test_sqrshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_sqrshl_v2i32:
+  %tmp1 = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: sqrshl v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_uqrshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_uqrshl_v4i32:
+  %tmp1 = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: uqrshl v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @test_sqrshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_sqrshl_v4i32:
+  %tmp1 = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: sqrshl v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %tmp1
+}
+
+declare <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_uqrshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_uqrshl_v1i64:
+  %tmp1 = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: uqrshl d0, d0, d1
+  ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_sqrshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_sqrshl_v1i64:
+  %tmp1 = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: sqrshl d0, d0, d1
+  ret <1 x i64> %tmp1
+}
+
+declare <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64>, <2 x i64>)
+
+define <2 x i64> @test_uqrshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_uqrshl_v2i64:
+  %tmp1 = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: uqrshl v0.2d, v0.2d, v1.2d
+  ret <2 x i64> %tmp1
+}
+
+define <2 x i64> @test_sqrshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_sqrshl_v2i64:
+  %tmp1 = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: sqrshl v0.2d, v0.2d, v1.2d
+  ret <2 x i64> %tmp1
+}
+
diff --git a/llvm/test/CodeGen/AArch64/neon-saturating-shift.ll b/llvm/test/CodeGen/AArch64/neon-saturating-shift.ll
new file mode 100644
index 0000000..3b7f78c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-saturating-shift.ll
@@ -0,0 +1,140 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
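+; [US]QSHL saturates when the shifted value no longer fits the lane,
+; e.g. sqshl(0x40, 1) on i8 lanes saturates to 0x7f.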
+declare <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_uqshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_uqshl_v8i8:
+  %tmp1 = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: uqshl v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @test_sqshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_sqshl_v8i8:
+  %tmp1 = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: sqshl v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_uqshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_uqshl_v16i8:
+  %tmp1 = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: uqshl v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @test_sqshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_sqshl_v16i8:
+  %tmp1 = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: sqshl v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_uqshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_uqshl_v4i16:
+  %tmp1 = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: uqshl v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @test_sqshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_sqshl_v4i16:
+  %tmp1 = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: sqshl v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %tmp1
+}
+
+declare <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_uqshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_uqshl_v8i16:
+  %tmp1 = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: uqshl v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @test_sqshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_sqshl_v8i16:
+  %tmp1 = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: sqshl v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %tmp1
+}
+
+declare <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_uqshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_uqshl_v2i32:
+  %tmp1 = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: uqshl v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @test_sqshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_sqshl_v2i32:
+  %tmp1 = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: sqshl v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_uqshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_uqshl_v4i32:
+  %tmp1 = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: uqshl v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @test_sqshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_sqshl_v4i32:
+  %tmp1 = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: sqshl v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %tmp1
+}
+
+declare <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_uqshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_uqshl_v1i64:
+  %tmp1 = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: uqshl d0, d0, d1
+  ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_sqshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_sqshl_v1i64:
+  %tmp1 = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: sqshl d0, d0, d1
+  ret <1 x i64> %tmp1
+}
+
+declare <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64>, <2 x i64>)
+
+define <2 x i64> @test_uqshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_uqshl_v2i64:
+  %tmp1 = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: uqshl v0.2d, v0.2d, v1.2d
+  ret <2 x i64> %tmp1
+}
+
+define <2 x i64> @test_sqshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_sqshl_v2i64:
+  %tmp1 = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: sqshl v0.2d, v0.2d, v1.2d
+  ret <2 x i64> %tmp1
+}
+
diff --git a/llvm/test/CodeGen/AArch64/neon-shift.ll b/llvm/test/CodeGen/AArch64/neon-shift.ll
new file mode 100644
index 0000000..45a2605
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-shift.ll
@@ -0,0 +1,143 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
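+; USHL/SSHL shift each lane left by the signed low byte of the matching rhs
+; lane; negative amounts shift right (logically for ushl, arithmetically for
+; sshl), with no rounding or saturation.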
+declare <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_ushl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_ushl_v8i8:
+  %tmp1 = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: ushl v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @test_sshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_sshl_v8i8:
+  %tmp1 = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: sshl v0.8b, v0.8b, v1.8b
+  ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_ushl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_ushl_v16i8:
+  %tmp1 = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: ushl v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @test_sshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_sshl_v16i8:
+  %tmp1 = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: sshl v0.16b, v0.16b, v1.16b
+  ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_ushl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_ushl_v4i16:
+  %tmp1 = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: ushl v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @test_sshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_sshl_v4i16:
+  %tmp1 = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: sshl v0.4h, v0.4h, v1.4h
+  ret <4 x i16> %tmp1
+}
+
+declare <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_ushl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_ushl_v8i16:
+  %tmp1 = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: ushl v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @test_sshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_sshl_v8i16:
+  %tmp1 = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: sshl v0.8h, v0.8h, v1.8h
+  ret <8 x i16> %tmp1
+}
+
+declare <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_ushl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_ushl_v2i32:
+  %tmp1 = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: ushl v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @test_sshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_sshl_v2i32:
+  %tmp1 = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: sshl v0.2s, v0.2s, v1.2s
+  ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_ushl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_ushl_v4i32:
+  %tmp1 = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: ushl v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @test_sshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_sshl_v4i32:
+  %tmp1 = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: sshl v0.4s, v0.4s, v1.4s
+  ret <4 x i32> %tmp1
+}
+
+declare <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_ushl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_ushl_v1i64:
+  %tmp1 = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: ushl d0, d0, d1
+  ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_sshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_sshl_v1i64:
+  %tmp1 = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: sshl d0, d0, d1
+  ret <1 x i64> %tmp1
+}
+
+declare <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64>, <2 x i64>)
+
+define <2 x i64> @test_ushl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_ushl_v2i64:
+  %tmp1 = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: ushl v0.2d, v0.2d, v1.2d
+  ret <2 x i64> %tmp1
+}
+
+define <2 x i64> @test_sshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_sshl_v2i64:
+  %tmp1 = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: sshl v0.2d, v0.2d, v1.2d
+  ret <2 x i64> %tmp1
+}
+
+
+
diff --git a/llvm/test/MC/AArch64/basic-a64-diagnostics.s b/llvm/test/MC/AArch64/basic-a64-diagnostics.s
index 1e9024c..2e6e0bb 100644
--- a/llvm/test/MC/AArch64/basic-a64-diagnostics.s
+++ b/llvm/test/MC/AArch64/basic-a64-diagnostics.s
@@ -1,4 +1,4 @@
-// RUN: not llvm-mc -triple=aarch64 < %s 2> %t
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu < %s 2> %t
 // RUN: FileCheck --check-prefix=CHECK-ERROR < %t %s
 
 //------------------------------------------------------------------------------
@@ -2892,13 +2892,13 @@
         movi wzr, #0x44444444
         movi w3, #0xffff
         movi x9, #0x0000ffff00000000
-// CHECK-ERROR: error: invalid instruction
+// CHECK-ERROR: error: invalid operand for instruction
 // CHECK-ERROR-NEXT:         movi wzr, #0x44444444
 // CHECK-ERROR-NEXT:         ^
-// CHECK-ERROR: error: invalid instruction
+// CHECK-ERROR: error: invalid operand for instruction
 // CHECK-ERROR-NEXT:         movi w3, #0xffff
 // CHECK-ERROR-NEXT:         ^
-// CHECK-ERROR: error: invalid instruction
+// CHECK-ERROR: error: invalid operand for instruction
 // CHECK-ERROR-NEXT:         movi x9, #0x0000ffff00000000
 // CHECK-ERROR-NEXT:         ^
 
diff --git a/llvm/test/MC/AArch64/basic-a64-instructions.s b/llvm/test/MC/AArch64/basic-a64-instructions.s
index ad3064e..e4f6b21 100644
--- a/llvm/test/MC/AArch64/basic-a64-instructions.s
+++ b/llvm/test/MC/AArch64/basic-a64-instructions.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding < %s | FileCheck %s
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding < %s | FileCheck %s
   .globl _func
 
 // Check that the assembler can handle the documented syntax from the ARM ARM.
diff --git a/llvm/test/MC/AArch64/neon-aba-abd.s b/llvm/test/MC/AArch64/neon-aba-abd.s
new file mode 100644
index 0000000..178eb26
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-aba-abd.s
@@ -0,0 +1,80 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//----------------------------------------------------------------------
+// Vector Absolute Difference and Accumulate (Signed, Unsigned)
+//----------------------------------------------------------------------
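+// UABA/SABA accumulate an absolute difference: Vd += |Vn - Vm|, with the
+// difference interpreted as unsigned or signed respectively.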
+         uaba v0.8b, v1.8b, v2.8b
+         uaba v0.16b, v1.16b, v2.16b
+         uaba v0.4h, v1.4h, v2.4h
+         uaba v0.8h, v1.8h, v2.8h
+         uaba v0.2s, v1.2s, v2.2s
+         uaba v0.4s, v1.4s, v2.4s
+
+// CHECK: uaba v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x7c,0x22,0x2e]
+// CHECK: uaba v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x7c,0x22,0x6e]
+// CHECK: uaba v0.4h, v1.4h, v2.4h        // encoding: [0x20,0x7c,0x62,0x2e]
+// CHECK: uaba v0.8h, v1.8h, v2.8h        // encoding: [0x20,0x7c,0x62,0x6e]
+// CHECK: uaba v0.2s, v1.2s, v2.2s        // encoding: [0x20,0x7c,0xa2,0x2e]
+// CHECK: uaba v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x7c,0xa2,0x6e]
+
+
+         saba v0.8b, v1.8b, v2.8b
+         saba v0.16b, v1.16b, v2.16b
+         saba v0.4h, v1.4h, v2.4h
+         saba v0.8h, v1.8h, v2.8h
+         saba v0.2s, v1.2s, v2.2s
+         saba v0.4s, v1.4s, v2.4s
+
+// CHECK: saba v0.8b, v1.8b, v2.8b         // encoding: [0x20,0x7c,0x22,0x0e]
+// CHECK: saba v0.16b, v1.16b, v2.16b      // encoding: [0x20,0x7c,0x22,0x4e]
+// CHECK: saba v0.4h, v1.4h, v2.4h         // encoding: [0x20,0x7c,0x62,0x0e]
+// CHECK: saba v0.8h, v1.8h, v2.8h         // encoding: [0x20,0x7c,0x62,0x4e]
+// CHECK: saba v0.2s, v1.2s, v2.2s         // encoding: [0x20,0x7c,0xa2,0x0e]
+// CHECK: saba v0.4s, v1.4s, v2.4s         // encoding: [0x20,0x7c,0xa2,0x4e]
+
+//----------------------------------------------------------------------
+// Vector Absolute Difference (Signed, Unsigned)
+//----------------------------------------------------------------------
+         uabd v0.8b, v1.8b, v2.8b
+         uabd v0.16b, v1.16b, v2.16b
+         uabd v0.4h, v1.4h, v2.4h
+         uabd v0.8h, v1.8h, v2.8h
+         uabd v0.2s, v1.2s, v2.2s
+         uabd v0.4s, v1.4s, v2.4s
+
+// CHECK: uabd v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x74,0x22,0x2e]
+// CHECK: uabd v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x74,0x22,0x6e]
+// CHECK: uabd v0.4h, v1.4h, v2.4h        // encoding: [0x20,0x74,0x62,0x2e]
+// CHECK: uabd v0.8h, v1.8h, v2.8h        // encoding: [0x20,0x74,0x62,0x6e]
+// CHECK: uabd v0.2s, v1.2s, v2.2s        // encoding: [0x20,0x74,0xa2,0x2e]
+// CHECK: uabd v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x74,0xa2,0x6e]
+
+         sabd v0.8b, v1.8b, v2.8b
+         sabd v0.16b, v1.16b, v2.16b
+         sabd v0.4h, v1.4h, v2.4h
+         sabd v0.8h, v1.8h, v2.8h
+         sabd v0.2s, v1.2s, v2.2s
+         sabd v0.4s, v1.4s, v2.4s
+
+// CHECK: sabd v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x74,0x22,0x0e]
+// CHECK: sabd v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x74,0x22,0x4e]
+// CHECK: sabd v0.4h, v1.4h, v2.4h        // encoding: [0x20,0x74,0x62,0x0e]
+// CHECK: sabd v0.8h, v1.8h, v2.8h        // encoding: [0x20,0x74,0x62,0x4e]
+// CHECK: sabd v0.2s, v1.2s, v2.2s        // encoding: [0x20,0x74,0xa2,0x0e]
+// CHECK: sabd v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x74,0xa2,0x4e]
+
+//----------------------------------------------------------------------
+// Vector Absolute Difference (Floating Point)
+//----------------------------------------------------------------------
+         fabd v0.2s, v1.2s, v2.2s
+         fabd v31.4s, v15.4s, v16.4s
+         fabd v7.2d, v8.2d, v25.2d
+
+// CHECK: fabd v0.2s, v1.2s, v2.2s    // encoding: [0x20,0xd4,0xa2,0x2e]
+// CHECK: fabd v31.4s, v15.4s, v16.4s // encoding: [0xff,0xd5,0xb0,0x6e]
+// CHECK: fabd v7.2d, v8.2d, v25.2d   // encoding: [0x07,0xd5,0xf9,0x6e]
+
diff --git a/llvm/test/MC/AArch64/neon-add-pairwise.s b/llvm/test/MC/AArch64/neon-add-pairwise.s
new file mode 100644
index 0000000..b586c22
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-add-pairwise.s
@@ -0,0 +1,35 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+
+//------------------------------------------------------------------------------
+// Vector Add Pairwise (Integer)
+//------------------------------------------------------------------------------
+         addp v0.8b, v1.8b, v2.8b
+         addp v0.16b, v1.16b, v2.16b
+         addp v0.4h, v1.4h, v2.4h
+         addp v0.8h, v1.8h, v2.8h
+         addp v0.2s, v1.2s, v2.2s
+         addp v0.4s, v1.4s, v2.4s
+         addp v0.2d, v1.2d, v2.2d
+
+// CHECK: addp v0.8b, v1.8b, v2.8b        // encoding: [0x20,0xbc,0x22,0x0e]
+// CHECK: addp v0.16b, v1.16b, v2.16b     // encoding: [0x20,0xbc,0x22,0x4e]
+// CHECK: addp v0.4h, v1.4h, v2.4h        // encoding: [0x20,0xbc,0x62,0x0e]
+// CHECK: addp v0.8h, v1.8h, v2.8h        // encoding: [0x20,0xbc,0x62,0x4e]
+// CHECK: addp v0.2s, v1.2s, v2.2s        // encoding: [0x20,0xbc,0xa2,0x0e]
+// CHECK: addp v0.4s, v1.4s, v2.4s        // encoding: [0x20,0xbc,0xa2,0x4e]
+// CHECK: addp v0.2d, v1.2d, v2.2d        // encoding: [0x20,0xbc,0xe2,0x4e]
+
+//------------------------------------------------------------------------------
+// Vector Add Pairwise (Floating Point)
+//------------------------------------------------------------------------------
+         faddp v0.2s, v1.2s, v2.2s
+         faddp v0.4s, v1.4s, v2.4s
+         faddp v0.2d, v1.2d, v2.2d
+
+// CHECK: faddp v0.2s, v1.2s, v2.2s       // encoding: [0x20,0xd4,0x22,0x2e]
+// CHECK: faddp v0.4s, v1.4s, v2.4s       // encoding: [0x20,0xd4,0x22,0x6e]
+// CHECK: faddp v0.2d, v1.2d, v2.2d       // encoding: [0x20,0xd4,0x62,0x6e]
+
diff --git a/llvm/test/MC/AArch64/neon-add-sub-instructions.s b/llvm/test/MC/AArch64/neon-add-sub-instructions.s
new file mode 100644
index 0000000..863798e
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-add-sub-instructions.s
@@ -0,0 +1,80 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+
+//------------------------------------------------------------------------------
+// Vector Integer Add
+//------------------------------------------------------------------------------
+         add v0.8b, v1.8b, v2.8b
+         add v0.16b, v1.16b, v2.16b
+         add v0.4h, v1.4h, v2.4h
+         add v0.8h, v1.8h, v2.8h
+         add v0.2s, v1.2s, v2.2s
+         add v0.4s, v1.4s, v2.4s
+         add v0.2d, v1.2d, v2.2d
+
+// CHECK: add v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x84,0x22,0x0e]
+// CHECK: add v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x84,0x22,0x4e]
+// CHECK: add v0.4h, v1.4h, v2.4h        // encoding: [0x20,0x84,0x62,0x0e]
+// CHECK: add v0.8h, v1.8h, v2.8h        // encoding: [0x20,0x84,0x62,0x4e]
+// CHECK: add v0.2s, v1.2s, v2.2s        // encoding: [0x20,0x84,0xa2,0x0e]
+// CHECK: add v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x84,0xa2,0x4e]
+// CHECK: add v0.2d, v1.2d, v2.2d        // encoding: [0x20,0x84,0xe2,0x4e]
+
+//------------------------------------------------------------------------------
+// Vector Integer Sub
+//------------------------------------------------------------------------------
+         sub v0.8b, v1.8b, v2.8b
+         sub v0.16b, v1.16b, v2.16b
+         sub v0.4h, v1.4h, v2.4h
+         sub v0.8h, v1.8h, v2.8h
+         sub v0.2s, v1.2s, v2.2s
+         sub v0.4s, v1.4s, v2.4s
+         sub v0.2d, v1.2d, v2.2d
+
+// CHECK: sub v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x84,0x22,0x2e]
+// CHECK: sub v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x84,0x22,0x6e]
+// CHECK: sub v0.4h, v1.4h, v2.4h        // encoding: [0x20,0x84,0x62,0x2e]
+// CHECK: sub v0.8h, v1.8h, v2.8h        // encoding: [0x20,0x84,0x62,0x6e]
+// CHECK: sub v0.2s, v1.2s, v2.2s        // encoding: [0x20,0x84,0xa2,0x2e]
+// CHECK: sub v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x84,0xa2,0x6e]
+// CHECK: sub v0.2d, v1.2d, v2.2d        // encoding: [0x20,0x84,0xe2,0x6e]
+
+//------------------------------------------------------------------------------
+// Vector Floating-Point Add
+//------------------------------------------------------------------------------
+         fadd v0.2s, v1.2s, v2.2s
+         fadd v0.4s, v1.4s, v2.4s
+         fadd v0.2d, v1.2d, v2.2d
+
+// CHECK: fadd v0.2s, v1.2s, v2.2s       // encoding: [0x20,0xd4,0x22,0x0e]
+// CHECK: fadd v0.4s, v1.4s, v2.4s       // encoding: [0x20,0xd4,0x22,0x4e]
+// CHECK: fadd v0.2d, v1.2d, v2.2d       // encoding: [0x20,0xd4,0x62,0x4e]
+
+
+//------------------------------------------------------------------------------
+// Vector Floating-Point Sub
+//------------------------------------------------------------------------------
+         fsub v0.2s, v1.2s, v2.2s
+         fsub v0.4s, v1.4s, v2.4s
+         fsub v0.2d, v1.2d, v2.2d
+
+// CHECK: fsub v0.2s, v1.2s, v2.2s       // encoding: [0x20,0xd4,0xa2,0x0e]
+// CHECK: fsub v0.4s, v1.4s, v2.4s       // encoding: [0x20,0xd4,0xa2,0x4e]
+// CHECK: fsub v0.2d, v1.2d, v2.2d       // encoding: [0x20,0xd4,0xe2,0x4e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Add
+//------------------------------------------------------------------------------
+         add d31, d0, d16
+
+// CHECK: add d31, d0, d16       // encoding: [0x1f,0x84,0xf0,0x5e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Sub
+//------------------------------------------------------------------------------
+         sub d1, d7, d8
+
+// CHECK: sub d1, d7, d8       // encoding: [0xe1,0x84,0xe8,0x7e]
+
diff --git a/llvm/test/MC/AArch64/neon-bitwise-instructions.s b/llvm/test/MC/AArch64/neon-bitwise-instructions.s
new file mode 100644
index 0000000..79d0a9b
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-bitwise-instructions.s
@@ -0,0 +1,60 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//------------------------------------------------------------------------------
+// Vector And
+//------------------------------------------------------------------------------
+         and v0.8b, v1.8b, v2.8b
+         and v0.16b, v1.16b, v2.16b
+
+// CHECK: and v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x1c,0x22,0x0e]
+// CHECK: and v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x1c,0x22,0x4e]
+
+
+//------------------------------------------------------------------------------
+// Vector Orr
+//------------------------------------------------------------------------------
+         orr v0.8b, v1.8b, v2.8b
+         orr v0.16b, v1.16b, v2.16b
+
+// CHECK: orr v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x1c,0xa2,0x0e]
+// CHECK: orr v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x1c,0xa2,0x4e]
+
+
+//------------------------------------------------------------------------------
+// Vector Eor
+//------------------------------------------------------------------------------
+         eor v0.8b, v1.8b, v2.8b
+         eor v0.16b, v1.16b, v2.16b
+
+// CHECK: eor v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x1c,0x22,0x2e]
+// CHECK: eor v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x1c,0x22,0x6e]
+
+
+//----------------------------------------------------------------------
+// Vector Bitwise
+//----------------------------------------------------------------------
+
+         bit v0.8b, v1.8b, v2.8b
+         bit v0.16b, v1.16b, v2.16b
+         bif v0.8b, v1.8b, v2.8b
+         bif v0.16b, v1.16b, v2.16b
+         bsl v0.8b, v1.8b, v2.8b
+         bsl v0.16b, v1.16b, v2.16b
+         orn v0.8b, v1.8b, v2.8b
+         orn v0.16b, v1.16b, v2.16b
+         bic v0.8b, v1.8b, v2.8b
+         bic v0.16b, v1.16b, v2.16b
+
+// CHECK: bit v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x1c,0xa2,0x2e]
+// CHECK: bit v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x1c,0xa2,0x6e]
+// CHECK: bif v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x1c,0xe2,0x2e]
+// CHECK: bif v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x1c,0xe2,0x6e]
+// CHECK: bsl v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x1c,0x62,0x2e]
+// CHECK: bsl v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x1c,0x62,0x6e]
+// CHECK: orn v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x1c,0xe2,0x0e]
+// CHECK: orn v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x1c,0xe2,0x4e]
+// CHECK: bic v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x1c,0x62,0x0e]
+// CHECK: bic v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x1c,0x62,0x4e]
+
diff --git a/llvm/test/MC/AArch64/neon-compare-instructions.s b/llvm/test/MC/AArch64/neon-compare-instructions.s
new file mode 100644
index 0000000..e4bc202
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-compare-instructions.s
@@ -0,0 +1,404 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Equal (Integer)
+//----------------------------------------------------------------------
+
+         cmeq v0.8b, v15.8b, v17.8b
+         cmeq v1.16b, v31.16b, v8.16b
+         cmeq v15.4h, v16.4h, v17.4h
+         cmeq v5.8h, v6.8h, v7.8h
+         cmeq v29.2s, v27.2s, v28.2s
+         cmeq v9.4s, v7.4s, v8.4s
+         cmeq v3.2d, v31.2d, v21.2d
+
+// CHECK: cmeq v0.8b, v15.8b, v17.8b    // encoding: [0xe0,0x8d,0x31,0x2e]
+// CHECK: cmeq v1.16b, v31.16b, v8.16b  // encoding: [0xe1,0x8f,0x28,0x6e]
+// CHECK: cmeq v15.4h, v16.4h, v17.4h   // encoding: [0x0f,0x8e,0x71,0x2e]
+// CHECK: cmeq v5.8h, v6.8h, v7.8h      // encoding: [0xc5,0x8c,0x67,0x6e]
+// CHECK: cmeq v29.2s, v27.2s, v28.2s   // encoding: [0x7d,0x8f,0xbc,0x2e]
+// CHECK: cmeq v9.4s, v7.4s, v8.4s      // encoding: [0xe9,0x8c,0xa8,0x6e]
+// CHECK: cmeq v3.2d, v31.2d, v21.2d    // encoding: [0xe3,0x8f,0xf5,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Higher or Same (Unsigned Integer)
+// Vector Compare Mask Less or Same (Unsigned Integer)
+// CMLS is an alias for CMHS with operands reversed.
+//----------------------------------------------------------------------
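+// Because CMLS is an alias, the assembler prints the canonical CMHS form,
+// so the CHECK lines below repeat the CMHS patterns for the CMLS input.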
+
+         cmhs v0.8b, v15.8b, v17.8b
+         cmhs v1.16b, v31.16b, v8.16b
+         cmhs v15.4h, v16.4h, v17.4h
+         cmhs v5.8h, v6.8h, v7.8h
+         cmhs v29.2s, v27.2s, v28.2s
+         cmhs v9.4s, v7.4s, v8.4s
+         cmhs v3.2d, v31.2d, v21.2d
+
+         cmls v0.8b, v17.8b, v15.8b
+         cmls v1.16b, v8.16b, v31.16b
+         cmls v15.4h, v17.4h, v16.4h
+         cmls v5.8h, v7.8h, v6.8h
+         cmls v29.2s, v28.2s, v27.2s
+         cmls v9.4s, v8.4s, v7.4s
+         cmls v3.2d, v21.2d, v31.2d
+
+// CHECK: cmhs v0.8b, v15.8b, v17.8b   // encoding: [0xe0,0x3d,0x31,0x2e]
+// CHECK: cmhs v1.16b, v31.16b, v8.16b // encoding: [0xe1,0x3f,0x28,0x6e]
+// CHECK: cmhs v15.4h, v16.4h, v17.4h  // encoding: [0x0f,0x3e,0x71,0x2e]
+// CHECK: cmhs v5.8h, v6.8h, v7.8h     // encoding: [0xc5,0x3c,0x67,0x6e]
+// CHECK: cmhs v29.2s, v27.2s, v28.2s  // encoding: [0x7d,0x3f,0xbc,0x2e]
+// CHECK: cmhs v9.4s, v7.4s, v8.4s     // encoding: [0xe9,0x3c,0xa8,0x6e]
+// CHECK: cmhs v3.2d, v31.2d, v21.2d   // encoding: [0xe3,0x3f,0xf5,0x6e]
+// CHECK: cmhs v0.8b, v15.8b, v17.8b   // encoding: [0xe0,0x3d,0x31,0x2e]
+// CHECK: cmhs v1.16b, v31.16b, v8.16b // encoding: [0xe1,0x3f,0x28,0x6e]
+// CHECK: cmhs v15.4h, v16.4h, v17.4h  // encoding: [0x0f,0x3e,0x71,0x2e]
+// CHECK: cmhs v5.8h, v6.8h, v7.8h     // encoding: [0xc5,0x3c,0x67,0x6e]
+// CHECK: cmhs v29.2s, v27.2s, v28.2s  // encoding: [0x7d,0x3f,0xbc,0x2e]
+// CHECK: cmhs v9.4s, v7.4s, v8.4s     // encoding: [0xe9,0x3c,0xa8,0x6e]
+// CHECK: cmhs v3.2d, v31.2d, v21.2d   // encoding: [0xe3,0x3f,0xf5,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than or Equal (Integer)
+// Vector Compare Mask Less Than or Equal (Integer)
+// CMLE is an alias for CMGE with operands reversed.
+//----------------------------------------------------------------------
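+// The CMLE forms below are therefore checked against the canonical CMGE output.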
+
+         cmge v0.8b, v15.8b, v17.8b
+         cmge v1.16b, v31.16b, v8.16b
+         cmge v15.4h, v16.4h, v17.4h
+         cmge v5.8h, v6.8h, v7.8h
+         cmge v29.2s, v27.2s, v28.2s
+         cmge v9.4s, v7.4s, v8.4s
+         cmge v3.2d, v31.2d, v21.2d
+
+         cmle v0.8b, v17.8b, v15.8b
+         cmle v1.16b, v8.16b, v31.16b
+         cmle v15.4h, v17.4h, v16.4h
+         cmle v5.8h, v7.8h, v6.8h
+         cmle v29.2s, v28.2s, v27.2s
+         cmle v9.4s, v8.4s, v7.4s
+         cmle v3.2d, v21.2d, v31.2d
+
+// CHECK: cmge v0.8b, v15.8b, v17.8b    // encoding: [0xe0,0x3d,0x31,0x0e]
+// CHECK: cmge v1.16b, v31.16b, v8.16b  // encoding: [0xe1,0x3f,0x28,0x4e]
+// CHECK: cmge v15.4h, v16.4h, v17.4h   // encoding: [0x0f,0x3e,0x71,0x0e]
+// CHECK: cmge v5.8h, v6.8h, v7.8h      // encoding: [0xc5,0x3c,0x67,0x4e]
+// CHECK: cmge v29.2s, v27.2s, v28.2s   // encoding: [0x7d,0x3f,0xbc,0x0e]
+// CHECK: cmge v9.4s, v7.4s, v8.4s      // encoding: [0xe9,0x3c,0xa8,0x4e]
+// CHECK: cmge v3.2d, v31.2d, v21.2d    // encoding: [0xe3,0x3f,0xf5,0x4e]
+// CHECK: cmge v0.8b, v15.8b, v17.8b    // encoding: [0xe0,0x3d,0x31,0x0e]
+// CHECK: cmge v1.16b, v31.16b, v8.16b  // encoding: [0xe1,0x3f,0x28,0x4e]
+// CHECK: cmge v15.4h, v16.4h, v17.4h   // encoding: [0x0f,0x3e,0x71,0x0e]
+// CHECK: cmge v5.8h, v6.8h, v7.8h      // encoding: [0xc5,0x3c,0x67,0x4e]
+// CHECK: cmge v29.2s, v27.2s, v28.2s   // encoding: [0x7d,0x3f,0xbc,0x0e]
+// CHECK: cmge v9.4s, v7.4s, v8.4s      // encoding: [0xe9,0x3c,0xa8,0x4e]
+// CHECK: cmge v3.2d, v31.2d, v21.2d    // encoding: [0xe3,0x3f,0xf5,0x4e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Higher (Unsigned Integer)
+// Vector Compare Mask Lower (Unsigned Integer)
+// CMLO is an alias for CMHI with operands reversed.
+//----------------------------------------------------------------------
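+// The CMLO forms below are therefore checked against the canonical CMHI output.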
+
+         cmhi v0.8b, v15.8b, v17.8b
+         cmhi v1.16b, v31.16b, v8.16b
+         cmhi v15.4h, v16.4h, v17.4h
+         cmhi v5.8h, v6.8h, v7.8h
+         cmhi v29.2s, v27.2s, v28.2s
+         cmhi v9.4s, v7.4s, v8.4s
+         cmhi v3.2d, v31.2d, v21.2d
+
+         cmlo v0.8b, v17.8b, v15.8b
+         cmlo v1.16b, v8.16b, v31.16b
+         cmlo v15.4h, v17.4h, v16.4h
+         cmlo v5.8h, v7.8h, v6.8h
+         cmlo v29.2s, v28.2s, v27.2s
+         cmlo v9.4s, v8.4s, v7.4s
+         cmlo v3.2d, v21.2d, v31.2d
+
+// CHECK: cmhi v0.8b, v15.8b, v17.8b    // encoding: [0xe0,0x35,0x31,0x2e]
+// CHECK: cmhi v1.16b, v31.16b, v8.16b  // encoding: [0xe1,0x37,0x28,0x6e]
+// CHECK: cmhi v15.4h, v16.4h, v17.4h   // encoding: [0x0f,0x36,0x71,0x2e]
+// CHECK: cmhi v5.8h, v6.8h, v7.8h      // encoding: [0xc5,0x34,0x67,0x6e]
+// CHECK: cmhi v29.2s, v27.2s, v28.2s   // encoding: [0x7d,0x37,0xbc,0x2e]
+// CHECK: cmhi v9.4s, v7.4s, v8.4s      // encoding: [0xe9,0x34,0xa8,0x6e]
+// CHECK: cmhi v3.2d, v31.2d, v21.2d    // encoding: [0xe3,0x37,0xf5,0x6e]
+// CHECK: cmhi v0.8b, v15.8b, v17.8b    // encoding: [0xe0,0x35,0x31,0x2e]
+// CHECK: cmhi v1.16b, v31.16b, v8.16b  // encoding: [0xe1,0x37,0x28,0x6e]
+// CHECK: cmhi v15.4h, v16.4h, v17.4h   // encoding: [0x0f,0x36,0x71,0x2e]
+// CHECK: cmhi v5.8h, v6.8h, v7.8h      // encoding: [0xc5,0x34,0x67,0x6e]
+// CHECK: cmhi v29.2s, v27.2s, v28.2s   // encoding: [0x7d,0x37,0xbc,0x2e]
+// CHECK: cmhi v9.4s, v7.4s, v8.4s      // encoding: [0xe9,0x34,0xa8,0x6e]
+// CHECK: cmhi v3.2d, v31.2d, v21.2d    // encoding: [0xe3,0x37,0xf5,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than (Integer)
+// Vector Compare Mask Less Than (Integer)
+// CMLT is an alias for CMGT with operands reversed.
+//----------------------------------------------------------------------
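+// The CMLT forms below are therefore checked against the canonical CMGT output.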
+
+         cmgt v0.8b, v15.8b, v17.8b
+         cmgt v1.16b, v31.16b, v8.16b
+         cmgt v15.4h, v16.4h, v17.4h
+         cmgt v5.8h, v6.8h, v7.8h
+         cmgt v29.2s, v27.2s, v28.2s
+         cmgt v9.4s, v7.4s, v8.4s
+         cmgt v3.2d, v31.2d, v21.2d
+
+         cmlt v0.8b, v17.8b, v15.8b
+         cmlt v1.16b, v8.16b, v31.16b
+         cmlt v15.4h, v17.4h, v16.4h
+         cmlt v5.8h, v7.8h, v6.8h
+         cmlt v29.2s, v28.2s, v27.2s
+         cmlt v9.4s, v8.4s, v7.4s
+         cmlt v3.2d, v21.2d, v31.2d
+
+// CHECK: cmgt v0.8b, v15.8b, v17.8b    // encoding: [0xe0,0x35,0x31,0x0e]
+// CHECK: cmgt v1.16b, v31.16b, v8.16b  // encoding: [0xe1,0x37,0x28,0x4e]
+// CHECK: cmgt v15.4h, v16.4h, v17.4h   // encoding: [0x0f,0x36,0x71,0x0e]
+// CHECK: cmgt v5.8h, v6.8h, v7.8h      // encoding: [0xc5,0x34,0x67,0x4e]
+// CHECK: cmgt v29.2s, v27.2s, v28.2s   // encoding: [0x7d,0x37,0xbc,0x0e]
+// CHECK: cmgt v9.4s, v7.4s, v8.4s      // encoding: [0xe9,0x34,0xa8,0x4e]
+// CHECK: cmgt v3.2d, v31.2d, v21.2d    // encoding: [0xe3,0x37,0xf5,0x4e]
+// CHECK: cmgt v0.8b, v15.8b, v17.8b    // encoding: [0xe0,0x35,0x31,0x0e]
+// CHECK: cmgt v1.16b, v31.16b, v8.16b  // encoding: [0xe1,0x37,0x28,0x4e]
+// CHECK: cmgt v15.4h, v16.4h, v17.4h   // encoding: [0x0f,0x36,0x71,0x0e]
+// CHECK: cmgt v5.8h, v6.8h, v7.8h      // encoding: [0xc5,0x34,0x67,0x4e]
+// CHECK: cmgt v29.2s, v27.2s, v28.2s   // encoding: [0x7d,0x37,0xbc,0x0e]
+// CHECK: cmgt v9.4s, v7.4s, v8.4s      // encoding: [0xe9,0x34,0xa8,0x4e]
+// CHECK: cmgt v3.2d, v31.2d, v21.2d    // encoding: [0xe3,0x37,0xf5,0x4e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Bitwise Test (Integer)
+//----------------------------------------------------------------------
+
+         cmtst v0.8b, v15.8b, v17.8b
+         cmtst v1.16b, v31.16b, v8.16b
+         cmtst v15.4h, v16.4h, v17.4h
+         cmtst v5.8h, v6.8h, v7.8h
+         cmtst v29.2s, v27.2s, v28.2s
+         cmtst v9.4s, v7.4s, v8.4s
+         cmtst v3.2d, v31.2d, v21.2d
+
+// CHECK: cmtst v0.8b, v15.8b, v17.8b    // encoding: [0xe0,0x8d,0x31,0x0e]
+// CHECK: cmtst v1.16b, v31.16b, v8.16b  // encoding: [0xe1,0x8f,0x28,0x4e]
+// CHECK: cmtst v15.4h, v16.4h, v17.4h   // encoding: [0x0f,0x8e,0x71,0x0e]
+// CHECK: cmtst v5.8h, v6.8h, v7.8h      // encoding: [0xc5,0x8c,0x67,0x4e]
+// CHECK: cmtst v29.2s, v27.2s, v28.2s   // encoding: [0x7d,0x8f,0xbc,0x0e]
+// CHECK: cmtst v9.4s, v7.4s, v8.4s      // encoding: [0xe9,0x8c,0xa8,0x4e]
+// CHECK: cmtst v3.2d, v31.2d, v21.2d    // encoding: [0xe3,0x8f,0xf5,0x4e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Equal (Floating Point)
+//----------------------------------------------------------------------
+
+         fcmeq v0.2s, v31.2s, v16.2s
+         fcmeq v4.4s, v7.4s, v15.4s
+         fcmeq v29.2d, v2.2d, v5.2d
+
+// CHECK: fcmeq v0.2s, v31.2s, v16.2s // encoding: [0xe0,0xe7,0x30,0x0e]
+// CHECK: fcmeq v4.4s, v7.4s, v15.4s  // encoding: [0xe4,0xe4,0x2f,0x4e]
+// CHECK: fcmeq v29.2d, v2.2d, v5.2d  // encoding: [0x5d,0xe4,0x65,0x4e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than Or Equal (Floating Point)
+// Vector Compare Mask Less Than Or Equal (Floating Point)
+// FCMLE is an alias for FCMGE with operands reversed.
+//----------------------------------------------------------------------
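+// The FCMLE forms below are likewise checked against the canonical FCMGE output.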
+
+         fcmge v31.4s, v29.4s, v28.4s
+         fcmge v3.2s, v8.2s, v12.2s
+         fcmge v17.2d, v15.2d, v13.2d
+         fcmle v31.4s, v28.4s, v29.4s
+         fcmle v3.2s,  v12.2s, v8.2s
+         fcmle v17.2d, v13.2d, v15.2d
+
+// CHECK: fcmge v31.4s, v29.4s, v28.4s  // encoding: [0xbf,0xe7,0x3c,0x6e]
+// CHECK: fcmge v3.2s, v8.2s, v12.2s    // encoding: [0x03,0xe5,0x2c,0x2e]
+// CHECK: fcmge v17.2d, v15.2d, v13.2d  // encoding: [0xf1,0xe5,0x6d,0x6e]
+// CHECK: fcmge v31.4s, v29.4s, v28.4s  // encoding: [0xbf,0xe7,0x3c,0x6e]
+// CHECK: fcmge v3.2s,  v8.2s, v12.2s   // encoding: [0x03,0xe5,0x2c,0x2e]
+// CHECK: fcmge v17.2d, v15.2d, v13.2d  // encoding: [0xf1,0xe5,0x6d,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than (Floating Point)
+// Vector Compare Mask Less Than (Floating Point)
+// FCMLT is an alias for FCMGT with operands reversed.
+//----------------------------------------------------------------------
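+// The FCMLT forms below are likewise checked against the canonical FCMGT output.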
+
+         fcmgt v0.2s, v31.2s, v16.2s
+         fcmgt v4.4s, v7.4s, v15.4s
+         fcmgt v29.2d, v2.2d, v5.2d
+         fcmlt v0.2s, v16.2s, v31.2s
+         fcmlt v4.4s, v15.4s, v7.4s
+         fcmlt v29.2d, v5.2d, v2.2d
+
+// CHECK: fcmgt v0.2s, v31.2s, v16.2s  // encoding: [0xe0,0xe7,0xb0,0x2e]
+// CHECK: fcmgt v4.4s, v7.4s, v15.4s   // encoding: [0xe4,0xe4,0xaf,0x6e]
+// CHECK: fcmgt v29.2d, v2.2d, v5.2d   // encoding: [0x5d,0xe4,0xe5,0x6e]
+// CHECK: fcmgt v0.2s, v31.2s, v16.2s  // encoding: [0xe0,0xe7,0xb0,0x2e]
+// CHECK: fcmgt v4.4s, v7.4s, v15.4s   // encoding: [0xe4,0xe4,0xaf,0x6e]
+// CHECK: fcmgt v29.2d, v2.2d, v5.2d   // encoding: [0x5d,0xe4,0xe5,0x6e]
+
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Equal to Zero (Integer)
+//----------------------------------------------------------------------
+
+         cmeq v0.8b, v15.8b, #0
+         cmeq v1.16b, v31.16b, #0
+         cmeq v15.4h, v16.4h, #0
+         cmeq v5.8h, v6.8h, #0
+         cmeq v29.2s, v27.2s, #0
+         cmeq v9.4s, v7.4s, #0
+         cmeq v3.2d, v31.2d, #0
+
+// CHECK: cmeq v0.8b, v15.8b, #0x0    // encoding: [0xe0,0x99,0x20,0x0e]
+// CHECK: cmeq v1.16b, v31.16b, #0x0  // encoding: [0xe1,0x9b,0x20,0x4e]
+// CHECK: cmeq v15.4h, v16.4h, #0x0   // encoding: [0x0f,0x9a,0x60,0x0e]
+// CHECK: cmeq v5.8h, v6.8h, #0x0     // encoding: [0xc5,0x98,0x60,0x4e]
+// CHECK: cmeq v29.2s, v27.2s, #0x0   // encoding: [0x7d,0x9b,0xa0,0x0e]
+// CHECK: cmeq v9.4s, v7.4s, #0x0     // encoding: [0xe9,0x98,0xa0,0x4e]
+// CHECK: cmeq v3.2d, v31.2d, #0x0    // encoding: [0xe3,0x9b,0xe0,0x4e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than or Equal to Zero (Signed Integer)
+//----------------------------------------------------------------------
+         cmge v0.8b, v15.8b, #0
+         cmge v1.16b, v31.16b, #0
+         cmge v15.4h, v16.4h, #0
+         cmge v5.8h, v6.8h, #0
+         cmge v29.2s, v27.2s, #0
+         cmge v17.4s, v20.4s, #0
+         cmge v3.2d, v31.2d, #0
+
+// CHECK: cmge v0.8b, v15.8b, #0x0    // encoding: [0xe0,0x89,0x20,0x2e]
+// CHECK: cmge v1.16b, v31.16b, #0x0  // encoding: [0xe1,0x8b,0x20,0x6e]
+// CHECK: cmge v15.4h, v16.4h, #0x0   // encoding: [0x0f,0x8a,0x60,0x2e]
+// CHECK: cmge v5.8h, v6.8h, #0x0     // encoding: [0xc5,0x88,0x60,0x6e]
+// CHECK: cmge v29.2s, v27.2s, #0x0   // encoding: [0x7d,0x8b,0xa0,0x2e]
+// CHECK: cmge v17.4s, v20.4s, #0x0   // encoding: [0x91,0x8a,0xa0,0x6e]
+// CHECK: cmge v3.2d, v31.2d, #0x0    // encoding: [0xe3,0x8b,0xe0,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than Zero (Signed Integer)
+//----------------------------------------------------------------------
+
+         cmgt v0.8b, v15.8b, #0
+         cmgt v1.16b, v31.16b, #0
+         cmgt v15.4h, v16.4h, #0
+         cmgt v5.8h, v6.8h, #0
+         cmgt v29.2s, v27.2s, #0
+         cmgt v9.4s, v7.4s, #0
+         cmgt v3.2d, v31.2d, #0
+
+// CHECK: cmgt v0.8b, v15.8b, #0x0    // encoding: [0xe0,0x89,0x20,0x0e]
+// CHECK: cmgt v1.16b, v31.16b, #0x0  // encoding: [0xe1,0x8b,0x20,0x4e]
+// CHECK: cmgt v15.4h, v16.4h, #0x0   // encoding: [0x0f,0x8a,0x60,0x0e]
+// CHECK: cmgt v5.8h, v6.8h, #0x0     // encoding: [0xc5,0x88,0x60,0x4e]
+// CHECK: cmgt v29.2s, v27.2s, #0x0   // encoding: [0x7d,0x8b,0xa0,0x0e]
+// CHECK: cmgt v9.4s, v7.4s, #0x0     // encoding: [0xe9,0x88,0xa0,0x4e]
+// CHECK: cmgt v3.2d, v31.2d, #0x0    // encoding: [0xe3,0x8b,0xe0,0x4e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Less Than or Equal To Zero (Signed Integer)
+//----------------------------------------------------------------------
+         cmle v0.8b, v15.8b, #0
+         cmle v1.16b, v31.16b, #0
+         cmle v15.4h, v16.4h, #0
+         cmle v5.8h, v6.8h, #0
+         cmle v29.2s, v27.2s, #0
+         cmle v9.4s, v7.4s, #0
+         cmle v3.2d, v31.2d, #0
+
+// CHECK: cmle v0.8b, v15.8b, #0x0    // encoding: [0xe0,0x99,0x20,0x2e]
+// CHECK: cmle v1.16b, v31.16b, #0x0  // encoding: [0xe1,0x9b,0x20,0x6e]
+// CHECK: cmle v15.4h, v16.4h, #0x0   // encoding: [0x0f,0x9a,0x60,0x2e]
+// CHECK: cmle v5.8h, v6.8h, #0x0     // encoding: [0xc5,0x98,0x60,0x6e]
+// CHECK: cmle v29.2s, v27.2s, #0x0   // encoding: [0x7d,0x9b,0xa0,0x2e]
+// CHECK: cmle v9.4s, v7.4s, #0x0     // encoding: [0xe9,0x98,0xa0,0x6e]
+// CHECK: cmle v3.2d, v31.2d, #0x0    // encoding: [0xe3,0x9b,0xe0,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Less Than Zero (Signed Integer)
+//----------------------------------------------------------------------
+         cmlt v0.8b, v15.8b, #0
+         cmlt v1.16b, v31.16b, #0
+         cmlt v15.4h, v16.4h, #0
+         cmlt v5.8h, v6.8h, #0
+         cmlt v29.2s, v27.2s, #0
+         cmlt v9.4s, v7.4s, #0
+         cmlt v3.2d, v31.2d, #0
+
+// CHECK: cmlt v0.8b, v15.8b, #0x0    // encoding: [0xe0,0xa9,0x20,0x0e]
+// CHECK: cmlt v1.16b, v31.16b, #0x0  // encoding: [0xe1,0xab,0x20,0x4e]
+// CHECK: cmlt v15.4h, v16.4h, #0x0   // encoding: [0x0f,0xaa,0x60,0x0e]
+// CHECK: cmlt v5.8h, v6.8h, #0x0     // encoding: [0xc5,0xa8,0x60,0x4e]
+// CHECK: cmlt v29.2s, v27.2s, #0x0   // encoding: [0x7d,0xab,0xa0,0x0e]
+// CHECK: cmlt v9.4s, v7.4s, #0x0     // encoding: [0xe9,0xa8,0xa0,0x4e]
+// CHECK: cmlt v3.2d, v31.2d, #0x0    // encoding: [0xe3,0xab,0xe0,0x4e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Equal to Zero (Floating Point)
+//----------------------------------------------------------------------
+         fcmeq v0.2s, v31.2s, #0.0
+         fcmeq v4.4s, v7.4s, #0.0
+         fcmeq v29.2d, v2.2d, #0.0
+
+// CHECK: fcmeq v0.2s, v31.2s, #0.0  // encoding: [0xe0,0xdb,0xa0,0x0e]
+// CHECK: fcmeq v4.4s, v7.4s, #0.0   // encoding: [0xe4,0xd8,0xa0,0x4e]
+// CHECK: fcmeq v29.2d, v2.2d, #0.0  // encoding: [0x5d,0xd8,0xe0,0x4e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than or Equal to Zero (Floating Point)
+//----------------------------------------------------------------------
+         fcmge v31.4s, v29.4s, #0.0
+         fcmge v3.2s, v8.2s, #0.0
+         fcmge v17.2d, v15.2d, #0.0
+
+// CHECK: fcmge v31.4s, v29.4s, #0.0  // encoding: [0xbf,0xcb,0xa0,0x6e]
+// CHECK: fcmge v3.2s, v8.2s, #0.0    // encoding: [0x03,0xc9,0xa0,0x2e]
+// CHECK: fcmge v17.2d, v15.2d, #0.0   // encoding: [0xf1,0xc9,0xe0,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than Zero (Floating Point)
+//----------------------------------------------------------------------
+         fcmgt v0.2s, v31.2s, #0.0
+         fcmgt v4.4s, v7.4s, #0.0
+         fcmgt v29.2d, v2.2d, #0.0
+
+// CHECK: fcmgt v0.2s, v31.2s, #0.0   // encoding: [0xe0,0xcb,0xa0,0x0e]
+// CHECK: fcmgt v4.4s, v7.4s, #0.0    // encoding: [0xe4,0xc8,0xa0,0x4e]
+// CHECK: fcmgt v29.2d, v2.2d, #0.0   // encoding: [0x5d,0xc8,0xe0,0x4e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Less Than or Equal To Zero (Floating Point)
+//----------------------------------------------------------------------
+         fcmle v1.4s, v8.4s, #0.0
+         fcmle v3.2s, v20.2s, #0.0
+         fcmle v7.2d, v13.2d, #0.0
+
+// CHECK: fcmle v1.4s, v8.4s, #0.0   // encoding: [0x01,0xd9,0xa0,0x6e]
+// CHECK: fcmle v3.2s, v20.2s, #0.0  // encoding: [0x83,0xda,0xa0,0x2e]
+// CHECK: fcmle v7.2d, v13.2d, #0.0  // encoding: [0xa7,0xd9,0xe0,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Less Than Zero (Floating Point)
+//----------------------------------------------------------------------
+         fcmlt v16.2s, v2.2s, #0.0
+         fcmlt v15.4s, v4.4s, #0.0
+         fcmlt v5.2d, v29.2d, #0.0
+
+// CHECK: fcmlt v16.2s, v2.2s, #0.0   // encoding: [0x50,0xe8,0xa0,0x0e]
+// CHECK: fcmlt v15.4s, v4.4s, #0.0   // encoding: [0x8f,0xe8,0xa0,0x4e]
+// CHECK: fcmlt v5.2d, v29.2d, #0.0   // encoding: [0xa5,0xeb,0xe0,0x4e]
+
diff --git a/llvm/test/MC/AArch64/neon-diagnostics.s b/llvm/test/MC/AArch64/neon-diagnostics.s
new file mode 100644
index 0000000..5373889
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-diagnostics.s
@@ -0,0 +1,1209 @@
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon < %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-ERROR < %t %s
+
+//------------------------------------------------------------------------------
+// Vector Integer Add/Sub
+//------------------------------------------------------------------------------
+
+        // Mismatched vector types
+        add v0.16b, v1.8b, v2.8b
+        sub v0.2d, v1.2d, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         add v0.16b, v1.8b, v2.8b
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sub v0.2d, v1.2d, v2.2s
+// CHECK-ERROR:                              ^
+
+//------------------------------------------------------------------------------
+// Vector Floating-Point Add/Sub
+//------------------------------------------------------------------------------
+
+        // Mismatched and invalid vector types
+        fadd v0.2d, v1.2s, v2.2s
+        fsub v0.4s, v1.2s, v2.4s
+        fsub v0.8b, v1.8b, v2.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fadd v0.2d, v1.2s, v2.2s
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fsub v0.4s, v1.2s, v2.4s
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fsub v0.8b, v1.8b, v2.8b
+// CHECK-ERROR:                  ^
+
+//----------------------------------------------------------------------
+// Vector Integer Mul
+//----------------------------------------------------------------------
+
+        // Mismatched and invalid vector types
+        mul v0.16b, v1.8b, v2.8b
+        mul v0.2d, v1.2d, v2.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         mul v0.16b, v1.8b, v2.8b
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         mul v0.2d, v1.2d, v2.2d
+// CHECK-ERROR:                ^
+
+//----------------------------------------------------------------------
+// Vector Floating-Point Mul/Div
+//----------------------------------------------------------------------
+        // Mismatched vector types
+        fmul v0.16b, v1.8b, v2.8b
+        fdiv v0.2s, v1.2d, v2.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fmul v0.16b, v1.8b, v2.8b
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fdiv v0.2s, v1.2d, v2.2d
+// CHECK-ERROR:                        ^
+
+//----------------------------------------------------------------------
+// Vector And, Orr, Eor, Bsl, Bit, Bif, Orn, Bic
+//----------------------------------------------------------------------
+        // Mismatched and invalid vector types
+        and v0.8b, v1.16b, v2.8b
+        orr v0.4h, v1.4h, v2.4h
+        eor v0.2s, v1.2s, v2.2s
+        bsl v0.8b, v1.16b, v2.8b
+        bsl v0.2s, v1.2s, v2.2s
+        bit v0.2d, v1.2d, v2.2d
+        bif v0.4h, v1.4h, v2.4h
+        orn v0.8b, v1.16b, v2.16b
+        bic v0.2d, v1.2d, v2.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         and v0.8b, v1.16b, v2.8b
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         orr v0.4h, v1.4h, v2.4h
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         eor v0.2s, v1.2s, v2.2s
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         bsl v0.8b, v1.16b, v2.8b
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         bsl v0.2s, v1.2s, v2.2s
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         bit v0.2d, v1.2d, v2.2d
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         bif v0.4h, v1.4h, v2.4h
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         orn v0.8b, v1.16b, v2.16b
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         bic v0.2d, v1.2d, v2.2d
+// CHECK-ERROR:                ^
+
+//----------------------------------------------------------------------
+// Vector Integer Multiply-accumulate and Multiply-subtract
+//----------------------------------------------------------------------
+
+        // Mismatched and invalid vector types
+        mla v0.16b, v1.8b, v2.8b
+        mls v0.2d, v1.2d, v2.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         mla v0.16b, v1.8b, v2.8b
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         mls v0.2d, v1.2d, v2.2d
+// CHECK-ERROR:                ^
+
+//----------------------------------------------------------------------
+// Vector Floating-Point Multiply-accumulate and Multiply-subtract
+//----------------------------------------------------------------------
+        // Mismatched vector types
+        fmla v0.2s, v1.2d, v2.2d
+        fmls v0.16b, v1.8b, v2.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fmla v0.2s, v1.2d, v2.2d
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fmls v0.16b, v1.8b, v2.8b
+// CHECK-ERROR:                         ^
+
+
+//----------------------------------------------------------------------
+// Vector Move Immediate Shifted
+// Vector Move Inverted Immediate Shifted
+// Vector Bitwise Bit Clear (AND NOT) - immediate
+// Vector Bitwise OR - immediate
+//----------------------------------------------------------------------
+      // out of range immediate (valid range: 0 to 0xff)
+      movi v0.2s, #-1
+      mvni v1.4s, #256
+      // out of range shift (valid: 0, 8, 16, 24 for 32-bit; 0, 8 for 16-bit elements)
+      bic v15.4h, #1, lsl #7
+      orr v31.2s, #1, lsl #25
+      movi v5.4h, #10, lsl #16
+      // invalid vector type (valid types: 2s, 4s, 4h, 8h)
+      movi v5.8b, #1, lsl #8
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:          movi v0.2s, #-1
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         mvni v1.4s, #256
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         bic v15.4h, #1, lsl #7
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         orr v31.2s, #1, lsl #25
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         movi v5.4h, #10, lsl #16
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         movi v5.8b, #1, lsl #8
+// CHECK-ERROR:                         ^
+//----------------------------------------------------------------------
+// Vector Move Immediate Masked
+// Vector Move Inverted Immediate Masked
+//----------------------------------------------------------------------
+      // out of range immediate (valid range: 0 to 0xff)
+      movi v0.2s, #-1, msl #8
+      mvni v7.4s, #256, msl #16
+      // out of range shift (valid: 8, 16)
+      movi v3.2s, #1, msl #0
+      mvni v17.4s, #255, msl #32
+      // invalid vector type (valid types: 2s, 4s)
+      movi v5.4h, #31, msl #8
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         movi v0.2s, #-1, msl #8
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         mvni v7.4s, #256, msl #16
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         movi v3.2s, #1, msl #0
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         mvni v17.4s, #255, msl #32
+// CHECK-ERROR:                            ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         movi v5.4h, #31, msl #8
+// CHECK-ERROR:                          ^
+
+//----------------------------------------------------------------------
+// Vector Immediate - per byte
+//----------------------------------------------------------------------
+        // out of range immediate (valid range: 0 to 0xff)
+        movi v0.8b, #-1
+        movi v1.16b, #256
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         movi v0.8b, #-1
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         movi v1.16b, #256
+// CHECK-ERROR:                      ^
+
+
+//----------------------------------------------------------------------
+// Vector Move Immediate - bytemask, per doubleword
+//---------------------------------------------------------------------
+        // invalid bytemask (each byte must be 0x00 or 0xff)
+        movi v0.2d, #0x10ff00ff00ff00ff
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         movi v0.2d, #0x10ff00ff00ff00ff
+// CHECK-ERROR:                     ^
+
+//----------------------------------------------------------------------
+// Vector Move Immediate - bytemask, one doubleword
+//----------------------------------------------------------------------
+        // invalid bytemask (each byte must be 0x00 or 0xff)
+        movi v0.2d, #0xffff00ff001f00ff
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         movi v0.2d, #0xffff00ff001f00ff
+// CHECK-ERROR:                     ^
+
+//----------------------------------------------------------------------
+// Vector Floating Point Move Immediate
+//----------------------------------------------------------------------
+        // invalid vector type (valid types: 2s, 4s, 2d)
+         fmov v0.4h, #1.0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fmov v0.4h, #1.0
+// CHECK-ERROR:              ^
+
+//----------------------------------------------------------------------
+// Vector Move - register
+//----------------------------------------------------------------------
+      // invalid vector type (valid types: 8b, 16b)
+      mov v0.2s, v31.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         mov v0.2s, v31.8b
+// CHECK-ERROR:                ^
+
+//----------------------------------------------------------------------
+// Vector Absolute Difference and Accumulate (Signed, Unsigned)
+//----------------------------------------------------------------------
+
+        // Mismatched and invalid vector types (2d)
+        saba v0.16b, v1.8b, v2.8b
+        uaba v0.2d, v1.2d, v2.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         saba v0.16b, v1.8b, v2.8b
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         uaba v0.2d, v1.2d, v2.2d
+// CHECK-ERROR:                ^
+
+//----------------------------------------------------------------------
+// Vector Absolute Difference and Accumulate (Signed, Unsigned)
+// Vector Absolute Difference (Signed, Unsigned)
+//----------------------------------------------------------------------
+
+        // Mismatched and invalid vector types (2d)
+        uaba v0.16b, v1.8b, v2.8b
+        saba v0.2d, v1.2d, v2.2d
+        uabd v0.4s, v1.2s, v2.2s
+        sabd v0.4h, v1.8h, v8.8h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         uaba v0.16b, v1.8b, v2.8b
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         saba v0.2d, v1.2d, v2.2d
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         uabd v0.4s, v1.2s, v2.2s
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sabd v0.4h, v1.8h, v8.8h
+// CHECK-ERROR:                        ^
+
+//----------------------------------------------------------------------
+// Vector Absolute Difference (Floating Point)
+//----------------------------------------------------------------------
+        // Mismatched and invalid vector types
+        fabd v0.2s, v1.4s, v2.2d
+        fabd v0.4h, v1.4h, v2.4h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fabd v0.2s, v1.4s, v2.2d
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fabd v0.4h, v1.4h, v2.4h
+// CHECK-ERROR:                 ^
+//----------------------------------------------------------------------
+// Vector Multiply (Polynomial)
+//----------------------------------------------------------------------
+
+        // Mismatched and invalid vector types
+         pmul v0.8b, v1.8b, v2.16b
+         pmul v0.2s, v1.2s, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         pmul v0.8b, v1.8b, v2.16b
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         pmul v0.2s, v1.2s, v2.2s
+// CHECK-ERROR:                 ^
+
+//----------------------------------------------------------------------
+// Scalar Integer Add and Sub
+//----------------------------------------------------------------------
+
+      // Mismatched registers
+         add d0, s1, d2
+         sub s1, d1, d2
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         add d0, s1, d2
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sub s1, d1, d2
+// CHECK-ERROR:             ^
+
+//----------------------------------------------------------------------
+// Vector Reciprocal Step (Floating Point)
+//----------------------------------------------------------------------
+
+        // Mismatched and invalid vector types
+         frecps v0.4s, v1.2d, v2.4s
+         frecps v0.8h, v1.8h, v2.8h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        frecps v0.4s, v1.2d, v2.4s
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        frecps v0.8h, v1.8h, v2.8h
+// CHECK-ERROR:                  ^
+
+//----------------------------------------------------------------------
+// Vector Reciprocal Square Root Step (Floating Point)
+//----------------------------------------------------------------------
+
+        // Mismatched and invalid vector types
+         frsqrts v0.2d, v1.2d, v2.2s
+         frsqrts v0.4h, v1.4h, v2.4h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        frsqrts v0.2d, v1.2d, v2.2s
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        frsqrts v0.4h, v1.4h, v2.4h
+// CHECK-ERROR:                   ^
+
+
+//----------------------------------------------------------------------
+// Vector Absolute Compare Mask Less Than Or Equal (Floating Point)
+//----------------------------------------------------------------------
+
+        // Mismatched and invalid vector types
+        facge v0.2d, v1.2s, v2.2d
+        facge v0.4h, v1.4h, v2.4h
+        facle v0.8h, v1.4h, v2.4h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        facge v0.2d, v1.2s, v2.2d
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        facge v0.4h, v1.4h, v2.4h
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        facle v0.8h, v1.4h, v2.4h
+// CHECK-ERROR:                 ^
+//----------------------------------------------------------------------
+// Vector Absolute Compare Mask Less Than (Floating Point)
+//----------------------------------------------------------------------
+
+        // Mismatched and invalid vector types
+        facgt v0.2d, v1.2d, v2.4s
+        facgt v0.8h, v1.8h, v2.8h
+        faclt v0.8b, v1.8b, v2.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        facgt v0.2d, v1.2d, v2.4s
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        facgt v0.8h, v1.8h, v2.8h
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        faclt v0.8b, v1.8b, v2.8b
+// CHECK-ERROR:                 ^
+
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Equal (Integer)
+//----------------------------------------------------------------------
+
+         // Mismatched vector types
+         cmeq c0.2d, v1.2d, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        cmeq c0.2d, v1.2d, v2.2s
+// CHECK-ERROR:                              ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Higher or Same (Unsigned Integer)
+// Vector Compare Mask Less or Same (Unsigned Integer)
+// CMLS is an alias for CMHS with operands reversed.
+//----------------------------------------------------------------------
+
+         // Mismatched vector types
+         cmhs c0.4h, v1.8b, v2.8b
+         cmls c0.16b, v1.16b, v2.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        cmhs c0.4h, v1.8b, v2.8b
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        cmls c0.16b, v1.16b, v2.2d
+// CHECK-ERROR:                                ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than or Equal (Integer)
+// Vector Compare Mask Less Than or Equal (Integer)
+// CMLE is an alias for CMGE with operands reversed.
+//----------------------------------------------------------------------
+
+         // Mismatched vector types
+         cmge c0.8h, v1.8b, v2.8b
+         cmle c0.4h, v1.2s, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        cmge c0.8h, v1.8b, v2.8b
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         cmle c0.4h, v1.2s, v2.2s
+// CHECK-ERROR:                        ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Higher (Unsigned Integer)
+// Vector Compare Mask Lower (Unsigned Integer)
+// CMLO is an alias for CMHI with operands reversed.
+//----------------------------------------------------------------------
+
+         // Mismatched vector types
+         cmhi c0.4s, v1.4s, v2.16b
+         cmlo c0.8b, v1.8b, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        cmhi c0.4s, v1.4s, v2.16b
+// CHECK-ERROR:                              ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         cmlo c0.8b, v1.8b, v2.2s
+// CHECK-ERROR:                               ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than (Integer)
+// Vector Compare Mask Less Than (Integer)
+// CMLT is an alias for CMGT with operands reversed.
+//----------------------------------------------------------------------
+
+         // Mismatched vector types
+         cmgt c0.8b, v1.4s, v2.16b
+         cmlt c0.8h, v1.16b, v2.4s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         cmgt c0.8b, v1.4s, v2.16b
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         cmlt c0.8h, v1.16b, v2.4s
+// CHECK-ERROR:                        ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Bitwise Test (Integer)
+//----------------------------------------------------------------------
+
+         // Mismatched vector types
+         cmtst c0.16b, v1.16b, v2.4s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         cmtst c0.16b, v1.16b, v2.4s
+// CHECK-ERROR:                                  ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Equal (Floating Point)
+//----------------------------------------------------------------------
+
+        // Mismatched and invalid vector types
+        fcmeq v0.2d, v1.2s, v2.2d
+        fcmeq v0.16b, v1.16b, v2.16b
+        fcmeq v0.8b, v1.4h, v2.4h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcmeq v0.2d, v1.2s, v2.2d
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcmeq v0.16b, v1.16b, v2.16b
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcmeq v0.8b, v1.4h, v2.4h
+// CHECK-ERROR:                 ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than Or Equal (Floating Point)
+// Vector Compare Mask Less Than Or Equal (Floating Point)
+// FCMLE is an alias for FCMGE with operands reversed.
+//----------------------------------------------------------------------
+
+        // Mismatched and invalid vector types
+         fcmge v31.4s, v29.2s, v28.4s
+         fcmge v3.8b, v8.2s, v12.2s
+         fcmle v17.8h, v15.2d, v13.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcmge v31.4s, v29.2s, v28.4s
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcmge v3.8b, v8.2s, v12.2s
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcmle v17.8h, v15.2d, v13.2d
+// CHECK-ERROR:                 ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than (Floating Point)
+// Vector Compare Mask Less Than (Floating Point)
+// FCMLT is an alias for FCMGT with operands reversed.
+//----------------------------------------------------------------------
+
+        // Mismatched and invalid vector types
+         fcmgt v0.2d, v31.2s, v16.2s
+         fcmgt v4.4s, v7.4s, v15.4h
+         fcmlt v29.2d, v5.2d, v2.16b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcmgt v0.2d, v31.2s, v16.2s
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: expected floating-point constant #0.0 or invalid register type
+// CHECK-ERROR:        fcmgt v4.4s, v7.4s, v15.4h
+// CHECK-ERROR:                                ^
+// CHECK-ERROR: error: expected floating-point constant #0.0 or invalid register type
+// CHECK-ERROR:        fcmlt v29.2d, v5.2d, v2.16b
+// CHECK-ERROR:                                ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Equal to Zero (Integer)
+//----------------------------------------------------------------------
+        // Mismatched vector types and invalid imm
+         cmeq c0.2d, v1.2s, #0
+         cmeq c0.2d, v1.2d, #1
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        cmeq c0.2d, v1.2s, #0
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        cmeq c0.2d, v1.2d, #1
+// CHECK-ERROR:                            ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than or Equal to Zero (Signed Integer)
+//----------------------------------------------------------------------
+        // Mismatched vector types and invalid imm
+         cmge c0.8h, v1.8b, #0
+         cmge c0.4s, v1.4s, #-1
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        cmge c0.8h, v1.8b, #0
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         cmge c0.4s, v1.4s, #-1
+// CHECK-ERROR:                             ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than Zero (Signed Integer)
+//----------------------------------------------------------------------
+        // Mismatched vector types and invalid imm
+         cmgt c0.8b, v1.4s, #0
+         cmgt c0.8b, v1.8b, #-255
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         cmgt c0.8b, v1.4s, #0
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         cmgt c0.8b, v1.8b, #-255
+// CHECK-ERROR:                             ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Less Than or Equal To Zero (Signed Integer)
+//----------------------------------------------------------------------
+        // Mismatched vector types and invalid imm
+         cmle c0.4h, v1.2s, #0
+         cmle c0.16b, v1.16b, #16
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        cmle c0.4h, v1.2s, #0
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         cmle c0.16b, v1.16b, #16
+// CHECK-ERROR:                               ^
+//----------------------------------------------------------------------
+// Vector Compare Mask Less Than Zero (Signed Integer)
+//----------------------------------------------------------------------
+        // Mismatched vector types and invalid imm
+         cmlt c0.8h, v1.16b, #0
+         cmlt c0.8h, v1.8h, #-15
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         cmlt c0.8h, v1.16b, #0
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         cmlt c0.8h, v1.8h, #-15
+// CHECK-ERROR:                             ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Equal to Zero (Floating Point)
+//----------------------------------------------------------------------
+
+        // Mismatched and invalid vector types, invalid imm
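+        // (per the diagnostics below, the only immediate these
+        // compares accept is #0.0)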
+        fcmeq v0.2d, v1.2s, #0.0
+        fcmeq v0.16b, v1.16b, #0.0
+        fcmeq v0.8b, v1.4h, #1.0
+        fcmeq v0.8b, v1.4h, #1
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcmeq v0.2d, v1.2s, #0.0
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcmeq v0.16b, v1.16b, #0.0
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcmeq v0.8b, v1.4h, #1.0
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error:  Expected floating-point immediate
+// CHECK-ERROR:        fcmeq v0.8b, v1.4h, #1
+// CHECK-ERROR:                             ^
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than or Equal to Zero (Floating Point)
+//----------------------------------------------------------------------
+
+        // Mismatched and invalid vector types, invalid imm
+         fcmge v31.4s, v29.2s, #0.0
+         fcmge v3.8b, v8.2s, #0.0
+         fcmle v17.8h, v15.2d, #-1.0
+         fcmle v17.8h, v15.2d, #0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcmge v31.4s, v29.2s, #0.0
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcmge v3.8b, v8.2s, #0.0
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcmle v17.8h, v15.2d, #-1.0
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error:  Expected floating-point immediate
+// CHECK-ERROR:        fcmle v17.8h, v15.2d, #0
+// CHECK-ERROR:                               ^
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than Zero (Floating Point)
+//----------------------------------------------------------------------
+        // Mismatched and invalid vector types, invalid imm
+         fcmgt v0.2d, v31.2s, #0.0
+         fcmgt v4.4s, v7.4h, #0.0
+         fcmlt v29.2d, v5.2d, #255.0
+         fcmlt v29.2d, v5.2d, #255
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcmgt v0.2d, v31.2s, #0.0
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcmgt v4.4s, v7.4h, #0.0
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: expected floating-point constant #0.0 or invalid register type
+// CHECK-ERROR:        fcmlt v29.2d, v5.2d, #255.0
+// CHECK-ERROR:                              ^
+// CHECK-ERROR: error:  Expected floating-point immediate
+// CHECK-ERROR:        fcmlt v29.2d, v5.2d, #255
+// CHECK-ERROR:                              ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Less Than or Equal to Zero (Floating Point)
+//----------------------------------------------------------------------
+        // Mismatched and invalid vector types, invalid imm
+         fcmge v31.4s, v29.2s, #0.0
+         fcmge v3.8b, v8.2s, #0.0
+         fcmle v17.2d, v15.2d, #15.0
+         fcmle v17.2d, v15.2d, #15
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcmge v31.4s, v29.2s, #0.0
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcmge v3.8b, v8.2s, #0.0
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: expected floating-point constant #0.0 or invalid register type
+// CHECK-ERROR:        fcmle v17.2d, v15.2d, #15.0
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error:  Expected floating-point immediate
+// CHECK-ERROR:        fcmle v17.2d, v15.2d, #15
+// CHECK-ERROR:                              ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Less Than Zero (Floating Point)
+//----------------------------------------------------------------------
+        // Mismatched and invalid vector types, invalid imm
+         fcmgt v0.2d, v31.2s, #0.0
+         fcmgt v4.4s, v7.4h, #0.0
+         fcmlt v29.2d, v5.2d, #16.0
+         fcmlt v29.2d, v5.2d, #2
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcmgt v0.2d, v31.2s, #0.0
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fcmgt v4.4s, v7.4h, #0.0
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: expected floating-point constant #0.0 or invalid register type
+// CHECK-ERROR:        fcmlt v29.2d, v5.2d, #16.0
+// CHECK-ERROR:                              ^
+// CHECK-ERROR: error:  Expected floating-point immediate
+// CHECK-ERROR:        fcmlt v29.2d, v5.2d, #2
+// CHECK-ERROR:                              ^
+
+//----------------------------------------------------------------------
+// Vector Integer Halving Add (Signed)
+// Vector Integer Halving Add (Unsigned)
+// Vector Integer Halving Sub (Signed)
+// Vector Integer Halving Sub (Unsigned)
+//----------------------------------------------------------------------
+        // Mismatched and invalid vector types (2d)
+        shadd v0.2d, v1.2d, v2.2d
+        uhadd v4.2s, v5.2s, v5.4h
+        shsub v11.4h, v12.8h, v13.4h
+        uhsub v31.16b, v29.8b, v28.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        shadd v0.2d, v1.2d, v2.2d
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uhadd v4.2s, v5.2s, v5.4h
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        shsub v11.4h, v12.8h, v13.4h
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uhsub v31.16b, v29.8b, v28.8b
+// CHECK-ERROR:                          ^
+
+//----------------------------------------------------------------------
+// Vector Integer Rounding Halving Add (Signed)
+// Vector Integer Rounding Halving Add (Unsigned)
+//----------------------------------------------------------------------
+
+        // Mismatched and invalid vector types (2d)
+        srhadd v0.2s, v1.2s, v2.2d
+        urhadd v0.16b, v1.16b, v2.8h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        srhadd v0.2s, v1.2s, v2.2d
+// CHECK-ERROR:                                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        urhadd v0.16b, v1.16b, v2.8h
+// CHECK-ERROR:                                  ^
+
+//----------------------------------------------------------------------
+// Vector Integer Saturating Add (Signed)
+// Vector Integer Saturating Add (Unsigned)
+// Vector Integer Saturating Sub (Signed)
+// Vector Integer Saturating Sub (Unsigned)
+//----------------------------------------------------------------------
+
+        // Mismatched vector types
+        sqadd v0.2s, v1.2s, v2.2d
+        uqadd v31.8h, v1.4h, v2.4h
+        sqsub v10.8h, v1.16b, v2.16b
+        uqsub v31.8b, v1.8b, v2.4s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqadd v0.2s, v1.2s, v2.2d
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uqadd v31.8h, v1.4h, v2.4h
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqsub v10.8h, v1.16b, v2.16b
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uqsub v31.8b, v1.8b, v2.4s
+// CHECK-ERROR:                                ^
+
+//----------------------------------------------------------------------
+// Scalar Integer Saturating Add (Signed)
+// Scalar Integer Saturating Add (Unsigned)
+// Scalar Integer Saturating Sub (Signed)
+// Scalar Integer Saturating Sub (Unsigned)
+//----------------------------------------------------------------------
+
+      // Mismatched registers
+         sqadd d0, s31, d2
+         uqadd s0, s1, d2
+         sqsub b0, b2, s18
+         uqsub h1, h2, d2
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqadd d0, s31, d2
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uqadd s0, s1, d2
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqsub b0, b2, s18
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uqsub h1, h2, d2
+// CHECK-ERROR:                      ^
+
+
+//----------------------------------------------------------------------
+// Vector Shift Left (Signed and Unsigned Integer)
+//----------------------------------------------------------------------
+        // Mismatched vector types
+        sshl v0.4s, v15.2s, v16.2s
+        ushl v1.16b, v25.16b, v6.8h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sshl v0.4s, v15.2s, v16.2s
+// CHECK-ERROR:                        ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        ushl v1.16b, v25.16b, v6.8h
+// CHECK-ERROR:                                 ^
+
+//----------------------------------------------------------------------
+// Vector Saturating Shift Left (Signed and Unsigned Integer)
+//----------------------------------------------------------------------
+        // Mismatched vector types
+        sqshl v0.2s, v15.2s, v16.2d
+        uqshl v1.8b, v25.4h, v6.8h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqshl v0.2s, v15.2s, v16.2d
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uqshl v1.8b, v25.4h, v6.8h
+// CHECK-ERROR:                         ^
+
+//----------------------------------------------------------------------
+// Vector Rounding Shift Left (Signed and Unsigned Integer)
+//----------------------------------------------------------------------
+        // Mismatched vector types
+        srshl v0.8h, v15.8h, v16.16b
+        urshl v1.2d, v25.2d, v6.4s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        srshl v0.8h, v15.8h, v16.16b
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        urshl v1.2d, v25.2d, v6.4s
+// CHECK-ERROR:                                ^
+
+//----------------------------------------------------------------------
+// Vector Saturating Rounding Shift Left (Signed and Unsigned Integer)
+//----------------------------------------------------------------------
+        // Mismatched vector types
+        sqrshl v0.2s, v15.8h, v16.16b
+        uqrshl v1.4h, v25.4h,  v6.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqrshl v0.2s, v15.8h, v16.16b
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uqrshl v1.4h, v25.4h,  v6.2d
+// CHECK-ERROR:                                  ^
+
+//----------------------------------------------------------------------
+// Scalar Integer Shift Left (Signed, Unsigned)
+//----------------------------------------------------------------------
+        // Mismatched and invalid register types
+        sshl d0, d1, s2
+        ushl b2, b0, b1
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sshl d0, d1, s2
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        ushl b2, b0, b1
+// CHECK-ERROR:             ^
+
+//----------------------------------------------------------------------
+// Scalar Integer Saturating Shift Left (Signed, Unsigned)
+//----------------------------------------------------------------------
+
+        // Mismatched register types
+        sqshl b0, b1, s0
+        uqshl h0, h1, b0
+        sqshl s0, s1, h0
+        uqshl d0, d1, b0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqshl b0, b1, s0
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uqshl h0, h1, b0
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqshl s0, s1, h0
+// CHECK-ERROR:                      ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uqshl d0, d1, b0
+// CHECK-ERROR:                      ^
+
+//----------------------------------------------------------------------
+// Scalar Integer Rounding Shift Left (Signed, Unsigned)
+//----------------------------------------------------------------------
+        // Mismatched and invalid register types
+        srshl h0, h1, h2
+        urshl s0, s1, s2
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        srshl h0, h1, h2
+// CHECK-ERROR:              ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        urshl s0, s1, s2
+// CHECK-ERROR:              ^
+
+
+//----------------------------------------------------------------------
+// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned)
+//----------------------------------------------------------------------
+
+        // Mismatched register types
+        sqrshl b0, b1, s0
+        uqrshl h0, h1, b0
+        sqrshl s0, s1, h0
+        uqrshl d0, d1, b0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqrshl b0, b1, s0
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uqrshl h0, h1, b0
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sqrshl s0, s1, h0
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uqrshl d0, d1, b0
+// CHECK-ERROR:                       ^
+
+
+//----------------------------------------------------------------------
+// Vector Maximum (Signed, Unsigned)
+//----------------------------------------------------------------------
+        // Mismatched and invalid vector types
+        smax v0.2d, v1.2d, v2.2d
+        umax v0.4h, v1.4h, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smax v0.2d, v1.2d, v2.2d
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umax v0.4h, v1.4h, v2.2s
+// CHECK-ERROR:                              ^
+
+//----------------------------------------------------------------------
+// Vector Minimum (Signed, Unsigned)
+//----------------------------------------------------------------------
+        // Mismatched and invalid vector types
+        smin v0.2d, v1.2d, v2.2d
+        umin v0.2s, v1.2s, v2.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smin v0.2d, v1.2d, v2.2d
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umin v0.2s, v1.2s, v2.8b
+// CHECK-ERROR:                             ^
+
+
+//----------------------------------------------------------------------
+// Vector Maximum (Floating Point)
+//----------------------------------------------------------------------
+        // Mismatched and invalid vector types
+        fmax v0.2s, v1.2s, v2.4s
+        fmax v0.8b, v1.8b, v2.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fmax v0.2s, v1.2s, v2.4s
+// CHECK-ERROR:                              ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fmax v0.8b, v1.8b, v2.8b
+// CHECK-ERROR:                ^
+//----------------------------------------------------------------------
+// Vector Minimum (Floating Point)
+//----------------------------------------------------------------------
+        // Mismatched and invalid vector types
+        fmin v0.4s, v1.4s, v2.2d
+        fmin v0.8h, v1.8h, v2.8h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fmin v0.4s, v1.4s, v2.2d
+// CHECK-ERROR:                              ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fmin v0.8h, v1.8h, v2.8h
+// CHECK-ERROR:                ^
+
+//----------------------------------------------------------------------
+// Vector maxNum (Floating Point)
+//----------------------------------------------------------------------
+        // Mismatched and invalid vector types
+        fmaxnm v0.2s, v1.2s, v2.2d
+        fmaxnm v0.4h, v1.8h, v2.4h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fmaxnm v0.2s, v1.2s, v2.2d
+// CHECK-ERROR:                                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fmaxnm v0.4h, v1.8h, v2.4h
+// CHECK-ERROR:                  ^
+
+//----------------------------------------------------------------------
+// Vector minNum (Floating Point)
+//----------------------------------------------------------------------
+        // Mismatched and invalid vector types
+        fminnm v0.4s, v1.2s, v2.4s
+        fminnm v0.16b, v0.16b, v0.16b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fminnm v0.4s, v1.2s, v2.4s
+// CHECK-ERROR:                         ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fminnm v0.16b, v0.16b, v0.16b
+// CHECK-ERROR:                  ^
+
+
+//----------------------------------------------------------------------
+// Vector Maximum Pairwise (Signed, Unsigned)
+//----------------------------------------------------------------------
+        // Mismatched and invalid vector types
+        smaxp v0.2d, v1.2d, v2.2d
+        umaxp v0.4h, v1.4h, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        smaxp v0.2d, v1.2d, v2.2d
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        umaxp v0.4h, v1.4h, v2.2s
+// CHECK-ERROR:                               ^
+
+//----------------------------------------------------------------------
+// Vector Minimum Pairwise (Signed, Unsigned)
+//----------------------------------------------------------------------
+        // Mismatched and invalid vector types
+        sminp v0.2d, v1.2d, v2.2d
+        uminp v0.2s, v1.2s, v2.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        sminp v0.2d, v1.2d, v2.2d
+// CHECK-ERROR:                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        uminp v0.2s, v1.2s, v2.8b
+// CHECK-ERROR:                               ^
+
+
+//----------------------------------------------------------------------
+// Vector Maximum Pairwise (Floating Point)
+//----------------------------------------------------------------------
+        // Mismatched and invalid vector types
+        fmaxp v0.2s, v1.2s, v2.4s
+        fmaxp v0.8b, v1.8b, v2.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fmaxp v0.2s, v1.2s, v2.4s
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fmaxp v0.8b, v1.8b, v2.8b
+// CHECK-ERROR:                 ^
+//----------------------------------------------------------------------
+// Vector Minimum Pairwise (Floating Point)
+//----------------------------------------------------------------------
+        // Mismatched and invalid vector types
+        fminp v0.4s, v1.4s, v2.2d
+        fminp v0.8h, v1.8h, v2.8h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fminp v0.4s, v1.4s, v2.2d
+// CHECK-ERROR:                               ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fminp v0.8h, v1.8h, v2.8h
+// CHECK-ERROR:                 ^
+
+//----------------------------------------------------------------------
+// Vector maxNum Pairwise (Floating Point)
+//----------------------------------------------------------------------
+        // Mismatched and invalid vector types
+        fmaxnmp v0.2s, v1.2s, v2.2d
+        fmaxnmp v0.4h, v1.8h, v2.4h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fmaxnmp v0.2s, v1.2s, v2.2d
+// CHECK-ERROR:                                 ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fmaxnmp v0.4h, v1.8h, v2.4h
+// CHECK-ERROR:                   ^
+
+//----------------------------------------------------------------------
+// Vector minNum Pairwise (Floating Point)
+//----------------------------------------------------------------------
+        // Mismatched and invalid vector types
+        fminnmp v0.4s, v1.2s, v2.4s
+        fminnmp v0.16b, v0.16b, v0.16b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fminnmp v0.4s, v1.2s, v2.4s
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:        fminnmp v0.16b, v0.16b, v0.16b
+// CHECK-ERROR:                   ^
+
+
+//----------------------------------------------------------------------
+// Vector Add Pairwise (Integer)
+//----------------------------------------------------------------------
+
+        // Mismatched vector types
+        addp v0.16b, v1.8b, v2.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         addp v0.16b, v1.8b, v2.8b
+// CHECK-ERROR:                         ^
+
+//----------------------------------------------------------------------
+// Vector Add Pairwise (Floating Point)
+//----------------------------------------------------------------------
+        // Mismatched and invalid vector types
+        faddp v0.16b, v1.8b, v2.8b
+        faddp v0.2d, v1.2d, v2.8h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         faddp v0.16b, v1.8b, v2.8b
+// CHECK-ERROR:                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         faddp v0.2d, v1.2d, v2.8h
+// CHECK-ERROR:                                ^
+
+
+//----------------------------------------------------------------------
+// Vector Saturating Doubling Multiply High
+//----------------------------------------------------------------------
+         // Mismatched and invalid vector types
+         sqdmulh v2.4h, v25.8h, v3.4h
+         sqdmulh v12.2d, v5.2d, v13.2d
+         sqdmulh v3.8b, v1.8b, v30.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqdmulh v2.4h, v25.8h, v3.4h
+// CHECK-ERROR:                            ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqdmulh v12.2d, v5.2d, v13.2d
+// CHECK-ERROR:                     ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqdmulh v3.8b, v1.8b, v30.8b
+// CHECK-ERROR:                    ^
+
+//----------------------------------------------------------------------
+// Vector Saturating Rounding Doubling Multiply High
+//----------------------------------------------------------------------
+         // Mismatched and invalid vector types
+         sqrdmulh v2.2s, v25.4s, v3.4s
+         sqrdmulh v12.16b, v5.16b, v13.16b
+         sqrdmulh v3.4h, v1.4h, v30.2d
+
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqrdmulh v2.2s, v25.4s, v3.4s
+// CHECK-ERROR:                             ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqrdmulh v12.16b, v5.16b, v13.16b
+// CHECK-ERROR:                       ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         sqrdmulh v3.4h, v1.4h, v30.2d
+// CHECK-ERROR:                                    ^
+
+//----------------------------------------------------------------------
+// Vector Multiply Extended
+//----------------------------------------------------------------------
+         // Mismatched and invalid vector types
+      fmulx v21.2s, v5.2s, v13.2d
+      fmulx v1.4h, v25.4h, v3.4h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fmulx v21.2s, v5.2s, v13.2d
+// CHECK-ERROR:                                  ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:         fmulx v1.4h, v25.4h, v3.4h
+// CHECK-ERROR:                  ^
diff --git a/llvm/test/MC/AArch64/neon-facge-facgt.s b/llvm/test/MC/AArch64/neon-facge-facgt.s
new file mode 100644
index 0000000..212eda2
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-facge-facgt.s
@@ -0,0 +1,41 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//----------------------------------------------------------------------
+// Vector Absolute Compare Mask Less Than Or Equal (Floating Point)
+// FACLE is alias for FACGE with operands reversed
+//----------------------------------------------------------------------
+         facge v0.2s, v31.2s, v16.2s
+         facge v4.4s, v7.4s, v15.4s
+         facge v29.2d, v2.2d, v5.2d
+         facle v0.2s, v16.2s, v31.2s
+         facle v4.4s, v15.4s, v7.4s
+         facle v29.2d, v5.2d, v2.2d
+
+// CHECK: facge v0.2s, v31.2s, v16.2s // encoding: [0xe0,0xef,0x30,0x2e]
+// CHECK: facge v4.4s, v7.4s, v15.4s  // encoding: [0xe4,0xec,0x2f,0x6e]
+// CHECK: facge v29.2d, v2.2d, v5.2d  // encoding: [0x5d,0xec,0x65,0x6e]
+// CHECK: facge v0.2s, v31.2s, v16.2s // encoding: [0xe0,0xef,0x30,0x2e]
+// CHECK: facge v4.4s, v7.4s, v15.4s  // encoding: [0xe4,0xec,0x2f,0x6e]
+// CHECK: facge v29.2d, v2.2d, v5.2d  // encoding: [0x5d,0xec,0x65,0x6e]
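+// Note that the three facle instructions assemble to the same encodings as
+// the three facge instructions above, i.e. the alias is resolved by swapping
+// the source operands.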
+
+//----------------------------------------------------------------------
+// Vector Absolute Compare Mask Less Than (Floating Point)
+// FACLT is alias for FACGT with operands reversed
+//----------------------------------------------------------------------
+         facgt v31.4s, v29.4s, v28.4s
+         facgt v3.2s, v8.2s, v12.2s
+         facgt v17.2d, v15.2d, v13.2d
+         faclt v31.4s, v28.4s, v29.4s
+         faclt v3.2s,  v12.2s, v8.2s
+         faclt v17.2d, v13.2d, v15.2d
+
+// CHECK: facgt v31.4s, v29.4s, v28.4s  // encoding: [0xbf,0xef,0xbc,0x6e]
+// CHECK: facgt v3.2s, v8.2s, v12.2s    // encoding: [0x03,0xed,0xac,0x2e]
+// CHECK: facgt v17.2d, v15.2d, v13.2d  // encoding: [0xf1,0xed,0xed,0x6e]
+// CHECK: facgt v31.4s, v29.4s, v28.4s  // encoding: [0xbf,0xef,0xbc,0x6e]
+// CHECK: facgt v3.2s, v8.2s, v12.2s    // encoding: [0x03,0xed,0xac,0x2e]
+// CHECK: facgt v17.2d, v15.2d, v13.2d  // encoding: [0xf1,0xed,0xed,0x6e]
+
+
diff --git a/llvm/test/MC/AArch64/neon-frsqrt-frecp.s b/llvm/test/MC/AArch64/neon-frsqrt-frecp.s
new file mode 100644
index 0000000..79fe5da
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-frsqrt-frecp.s
@@ -0,0 +1,27 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//----------------------------------------------------------------------
+// Vector Reciprocal Square Root Step (Floating Point)
+//----------------------------------------------------------------------
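+// Per the ARMv8 ISA, frsqrts computes the Newton-Raphson step
+// (3.0 - a * b) / 2.0 per element, used to refine frsqrte estimates.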
+         frsqrts v0.2s, v31.2s, v16.2s
+         frsqrts v4.4s, v7.4s, v15.4s
+         frsqrts v29.2d, v2.2d, v5.2d
+
+// CHECK: frsqrts v0.2s, v31.2s, v16.2s // encoding: [0xe0,0xff,0xb0,0x0e]
+// CHECK: frsqrts v4.4s, v7.4s, v15.4s  // encoding: [0xe4,0xfc,0xaf,0x4e]
+// CHECK: frsqrts v29.2d, v2.2d, v5.2d  // encoding: [0x5d,0xfc,0xe5,0x4e]
+
+//----------------------------------------------------------------------
+// Vector Reciprocal Step (Floating Point)
+//----------------------------------------------------------------------
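+// Per the ARMv8 ISA, frecps computes the Newton-Raphson step 2.0 - a * b
+// per element, used to refine frecpe estimates.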
+         frecps v31.4s, v29.4s, v28.4s
+         frecps v3.2s, v8.2s, v12.2s
+         frecps v17.2d, v15.2d, v13.2d
+
+// CHECK: frecps v31.4s, v29.4s, v28.4s  // encoding: [0xbf,0xff,0x3c,0x4e]
+// CHECK: frecps v3.2s, v8.2s, v12.2s    // encoding: [0x03,0xfd,0x2c,0x0e]
+// CHECK: frecps v17.2d, v15.2d, v13.2d  // encoding: [0xf1,0xfd,0x6d,0x4e]
+
+
diff --git a/llvm/test/MC/AArch64/neon-halving-add-sub.s b/llvm/test/MC/AArch64/neon-halving-add-sub.s
new file mode 100644
index 0000000..555f1b8
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-halving-add-sub.s
@@ -0,0 +1,74 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+
+//------------------------------------------------------------------------------
+// Vector Integer Halving Add (Signed)
+//------------------------------------------------------------------------------
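+// Halving instructions shift the element-wise result right by one bit
+// ((a + b) >> 1 here), so the intermediate add or sub cannot overflow.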
+         shadd v0.8b, v1.8b, v2.8b
+         shadd v0.16b, v1.16b, v2.16b
+         shadd v0.4h, v1.4h, v2.4h
+         shadd v0.8h, v1.8h, v2.8h
+         shadd v0.2s, v1.2s, v2.2s
+         shadd v0.4s, v1.4s, v2.4s
+
+// CHECK: shadd v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x04,0x22,0x0e]
+// CHECK: shadd v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x04,0x22,0x4e]
+// CHECK: shadd v0.4h, v1.4h, v2.4h        // encoding: [0x20,0x04,0x62,0x0e]
+// CHECK: shadd v0.8h, v1.8h, v2.8h        // encoding: [0x20,0x04,0x62,0x4e]
+// CHECK: shadd v0.2s, v1.2s, v2.2s        // encoding: [0x20,0x04,0xa2,0x0e]
+// CHECK: shadd v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x04,0xa2,0x4e]
+
+
+//------------------------------------------------------------------------------
+// Vector Integer Halving Add (Unsigned)
+//------------------------------------------------------------------------------
+         uhadd v0.8b, v1.8b, v2.8b
+         uhadd v0.16b, v1.16b, v2.16b
+         uhadd v0.4h, v1.4h, v2.4h
+         uhadd v0.8h, v1.8h, v2.8h
+         uhadd v0.2s, v1.2s, v2.2s
+         uhadd v0.4s, v1.4s, v2.4s
+
+// CHECK: uhadd v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x04,0x22,0x2e]
+// CHECK: uhadd v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x04,0x22,0x6e]
+// CHECK: uhadd v0.4h, v1.4h, v2.4h        // encoding: [0x20,0x04,0x62,0x2e]
+// CHECK: uhadd v0.8h, v1.8h, v2.8h        // encoding: [0x20,0x04,0x62,0x6e]
+// CHECK: uhadd v0.2s, v1.2s, v2.2s        // encoding: [0x20,0x04,0xa2,0x2e]
+// CHECK: uhadd v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x04,0xa2,0x6e]
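+// Note the encodings differ from shadd only in bit 29 (the U bit): compare
+// the trailing bytes 0x0e/0x4e for shadd with 0x2e/0x6e for uhadd.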
+
+//------------------------------------------------------------------------------
+// Vector Integer Halving Sub (Signed)
+//------------------------------------------------------------------------------
+         shsub v0.8b, v1.8b, v2.8b
+         shsub v0.16b, v1.16b, v2.16b
+         shsub v0.4h, v1.4h, v2.4h
+         shsub v0.8h, v1.8h, v2.8h
+         shsub v0.2s, v1.2s, v2.2s
+         shsub v0.4s, v1.4s, v2.4s
+
+// CHECK: shsub v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x24,0x22,0x0e]
+// CHECK: shsub v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x24,0x22,0x4e]
+// CHECK: shsub v0.4h, v1.4h, v2.4h        // encoding: [0x20,0x24,0x62,0x0e]
+// CHECK: shsub v0.8h, v1.8h, v2.8h        // encoding: [0x20,0x24,0x62,0x4e]
+// CHECK: shsub v0.2s, v1.2s, v2.2s        // encoding: [0x20,0x24,0xa2,0x0e]
+// CHECK: shsub v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x24,0xa2,0x4e]
+
+//------------------------------------------------------------------------------
+// Vector Integer Halving Sub (Unsigned)
+//------------------------------------------------------------------------------
+         uhsub v0.8b, v1.8b, v2.8b
+         uhsub v0.16b, v1.16b, v2.16b
+         uhsub v0.4h, v1.4h, v2.4h
+         uhsub v0.8h, v1.8h, v2.8h
+         uhsub v0.2s, v1.2s, v2.2s
+         uhsub v0.4s, v1.4s, v2.4s
+
+// CHECK: uhsub v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x24,0x22,0x2e]
+// CHECK: uhsub v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x24,0x22,0x6e]
+// CHECK: uhsub v0.4h, v1.4h, v2.4h        // encoding: [0x20,0x24,0x62,0x2e]
+// CHECK: uhsub v0.8h, v1.8h, v2.8h        // encoding: [0x20,0x24,0x62,0x6e]
+// CHECK: uhsub v0.2s, v1.2s, v2.2s        // encoding: [0x20,0x24,0xa2,0x2e]
+// CHECK: uhsub v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x24,0xa2,0x6e]
+
diff --git a/llvm/test/MC/AArch64/neon-max-min-pairwise.s b/llvm/test/MC/AArch64/neon-max-min-pairwise.s
new file mode 100644
index 0000000..8d2dadb
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-max-min-pairwise.s
@@ -0,0 +1,110 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//----------------------------------------------------------------------
+// Vector Maximum Pairwise (Signed and Unsigned Integer)
+//----------------------------------------------------------------------
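+// Pairwise ops reduce adjacent element pairs drawn from the concatenation
+// of the two source vectors, rather than combining corresponding lanes.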
+         smaxp v0.8b, v1.8b, v2.8b
+         smaxp v0.16b, v1.16b, v2.16b
+         smaxp v0.4h, v1.4h, v2.4h
+         smaxp v0.8h, v1.8h, v2.8h
+         smaxp v0.2s, v1.2s, v2.2s
+         smaxp v0.4s, v1.4s, v2.4s
+
+// CHECK: smaxp v0.8b, v1.8b, v2.8b        // encoding: [0x20,0xa4,0x22,0x0e]
+// CHECK: smaxp v0.16b, v1.16b, v2.16b     // encoding: [0x20,0xa4,0x22,0x4e]
+// CHECK: smaxp v0.4h, v1.4h, v2.4h        // encoding: [0x20,0xa4,0x62,0x0e]
+// CHECK: smaxp v0.8h, v1.8h, v2.8h        // encoding: [0x20,0xa4,0x62,0x4e]
+// CHECK: smaxp v0.2s, v1.2s, v2.2s        // encoding: [0x20,0xa4,0xa2,0x0e]
+// CHECK: smaxp v0.4s, v1.4s, v2.4s        // encoding: [0x20,0xa4,0xa2,0x4e]
+
+         umaxp v0.8b, v1.8b, v2.8b
+         umaxp v0.16b, v1.16b, v2.16b
+         umaxp v0.4h, v1.4h, v2.4h
+         umaxp v0.8h, v1.8h, v2.8h
+         umaxp v0.2s, v1.2s, v2.2s
+         umaxp v0.4s, v1.4s, v2.4s
+
+// CHECK: umaxp v0.8b, v1.8b, v2.8b         // encoding: [0x20,0xa4,0x22,0x2e]
+// CHECK: umaxp v0.16b, v1.16b, v2.16b      // encoding: [0x20,0xa4,0x22,0x6e]
+// CHECK: umaxp v0.4h, v1.4h, v2.4h         // encoding: [0x20,0xa4,0x62,0x2e]
+// CHECK: umaxp v0.8h, v1.8h, v2.8h         // encoding: [0x20,0xa4,0x62,0x6e]
+// CHECK: umaxp v0.2s, v1.2s, v2.2s         // encoding: [0x20,0xa4,0xa2,0x2e]
+// CHECK: umaxp v0.4s, v1.4s, v2.4s         // encoding: [0x20,0xa4,0xa2,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Minimum Pairwise (Signed and Unsigned Integer)
+//----------------------------------------------------------------------
+         sminp v0.8b, v1.8b, v2.8b
+         sminp v0.16b, v1.16b, v2.16b
+         sminp v0.4h, v1.4h, v2.4h
+         sminp v0.8h, v1.8h, v2.8h
+         sminp v0.2s, v1.2s, v2.2s
+         sminp v0.4s, v1.4s, v2.4s
+
+// CHECK: sminp v0.8b, v1.8b, v2.8b        // encoding: [0x20,0xac,0x22,0x0e]
+// CHECK: sminp v0.16b, v1.16b, v2.16b     // encoding: [0x20,0xac,0x22,0x4e]
+// CHECK: sminp v0.4h, v1.4h, v2.4h        // encoding: [0x20,0xac,0x62,0x0e]
+// CHECK: sminp v0.8h, v1.8h, v2.8h        // encoding: [0x20,0xac,0x62,0x4e]
+// CHECK: sminp v0.2s, v1.2s, v2.2s        // encoding: [0x20,0xac,0xa2,0x0e]
+// CHECK: sminp v0.4s, v1.4s, v2.4s        // encoding: [0x20,0xac,0xa2,0x4e]
+
+         uminp v0.8b, v1.8b, v2.8b
+         uminp v0.16b, v1.16b, v2.16b
+         uminp v0.4h, v1.4h, v2.4h
+         uminp v0.8h, v1.8h, v2.8h
+         uminp v0.2s, v1.2s, v2.2s
+         uminp v0.4s, v1.4s, v2.4s
+
+// CHECK: uminp v0.8b, v1.8b, v2.8b         // encoding: [0x20,0xac,0x22,0x2e]
+// CHECK: uminp v0.16b, v1.16b, v2.16b      // encoding: [0x20,0xac,0x22,0x6e]
+// CHECK: uminp v0.4h, v1.4h, v2.4h         // encoding: [0x20,0xac,0x62,0x2e]
+// CHECK: uminp v0.8h, v1.8h, v2.8h         // encoding: [0x20,0xac,0x62,0x6e]
+// CHECK: uminp v0.2s, v1.2s, v2.2s         // encoding: [0x20,0xac,0xa2,0x2e]
+// CHECK: uminp v0.4s, v1.4s, v2.4s         // encoding: [0x20,0xac,0xa2,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Maximum Pairwise (Floating Point)
+//----------------------------------------------------------------------
+         fmaxp v0.2s, v1.2s, v2.2s
+         fmaxp v31.4s, v15.4s, v16.4s
+         fmaxp v7.2d, v8.2d, v25.2d
+
+// CHECK: fmaxp v0.2s, v1.2s, v2.2s    // encoding: [0x20,0xf4,0x22,0x2e]
+// CHECK: fmaxp v31.4s, v15.4s, v16.4s // encoding: [0xff,0xf5,0x30,0x6e]
+// CHECK: fmaxp v7.2d, v8.2d, v25.2d   // encoding: [0x07,0xf5,0x79,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Minimum Pairwise (Floating Point)
+//----------------------------------------------------------------------
+         fminp v10.2s, v15.2s, v22.2s
+         fminp v3.4s, v5.4s, v6.4s
+         fminp v17.2d, v13.2d, v2.2d
+
+// CHECK: fminp v10.2s, v15.2s, v22.2s  // encoding: [0xea,0xf5,0xb6,0x2e]
+// CHECK: fminp v3.4s, v5.4s, v6.4s     // encoding: [0xa3,0xf4,0xa6,0x6e]
+// CHECK: fminp v17.2d, v13.2d, v2.2d   // encoding: [0xb1,0xf5,0xe2,0x6e]
+
+//----------------------------------------------------------------------
+// Vector maxNum Pairwise (Floating Point)
+//----------------------------------------------------------------------
+         fmaxnmp v0.2s, v1.2s, v2.2s
+         fmaxnmp v31.4s, v15.4s, v16.4s
+         fmaxnmp v7.2d, v8.2d, v25.2d
+
+// CHECK: fmaxnmp v0.2s, v1.2s, v2.2s    // encoding: [0x20,0xc4,0x22,0x2e]
+// CHECK: fmaxnmp v31.4s, v15.4s, v16.4s // encoding: [0xff,0xc5,0x30,0x6e]
+// CHECK: fmaxnmp v7.2d, v8.2d, v25.2d   // encoding: [0x07,0xc5,0x79,0x6e]
+
+//----------------------------------------------------------------------
+// Vector minNum Pairwise (Floating Point)
+//----------------------------------------------------------------------
+         fminnmp v10.2s, v15.2s, v22.2s
+         fminnmp v3.4s, v5.4s, v6.4s
+         fminnmp v17.2d, v13.2d, v2.2d
+
+// CHECK: fminnmp v10.2s, v15.2s, v22.2s  // encoding: [0xea,0xc5,0xb6,0x2e]
+// CHECK: fminnmp v3.4s, v5.4s, v6.4s     // encoding: [0xa3,0xc4,0xa6,0x6e]
+// CHECK: fminnmp v17.2d, v13.2d, v2.2d   // encoding: [0xb1,0xc5,0xe2,0x6e]
+
diff --git a/llvm/test/MC/AArch64/neon-max-min.s b/llvm/test/MC/AArch64/neon-max-min.s
new file mode 100644
index 0000000..6d1efde
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-max-min.s
@@ -0,0 +1,110 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//----------------------------------------------------------------------
+// Vector Maximum (Signed and Unsigned Integer)
+//----------------------------------------------------------------------
+         smax v0.8b, v1.8b, v2.8b
+         smax v0.16b, v1.16b, v2.16b
+         smax v0.4h, v1.4h, v2.4h
+         smax v0.8h, v1.8h, v2.8h
+         smax v0.2s, v1.2s, v2.2s
+         smax v0.4s, v1.4s, v2.4s
+
+// CHECK: smax v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x64,0x22,0x0e]
+// CHECK: smax v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x64,0x22,0x4e]
+// CHECK: smax v0.4h, v1.4h, v2.4h        // encoding: [0x20,0x64,0x62,0x0e]
+// CHECK: smax v0.8h, v1.8h, v2.8h        // encoding: [0x20,0x64,0x62,0x4e]
+// CHECK: smax v0.2s, v1.2s, v2.2s        // encoding: [0x20,0x64,0xa2,0x0e]
+// CHECK: smax v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x64,0xa2,0x4e]
+
+         umax v0.8b, v1.8b, v2.8b
+         umax v0.16b, v1.16b, v2.16b
+         umax v0.4h, v1.4h, v2.4h
+         umax v0.8h, v1.8h, v2.8h
+         umax v0.2s, v1.2s, v2.2s
+         umax v0.4s, v1.4s, v2.4s
+
+// CHECK: umax v0.8b, v1.8b, v2.8b         // encoding: [0x20,0x64,0x22,0x2e]
+// CHECK: umax v0.16b, v1.16b, v2.16b      // encoding: [0x20,0x64,0x22,0x6e]
+// CHECK: umax v0.4h, v1.4h, v2.4h         // encoding: [0x20,0x64,0x62,0x2e]
+// CHECK: umax v0.8h, v1.8h, v2.8h         // encoding: [0x20,0x64,0x62,0x6e]
+// CHECK: umax v0.2s, v1.2s, v2.2s         // encoding: [0x20,0x64,0xa2,0x2e]
+// CHECK: umax v0.4s, v1.4s, v2.4s         // encoding: [0x20,0x64,0xa2,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Minimum (Signed and Unsigned Integer)
+//----------------------------------------------------------------------
+         smin v0.8b, v1.8b, v2.8b
+         smin v0.16b, v1.16b, v2.16b
+         smin v0.4h, v1.4h, v2.4h
+         smin v0.8h, v1.8h, v2.8h
+         smin v0.2s, v1.2s, v2.2s
+         smin v0.4s, v1.4s, v2.4s
+
+// CHECK: smin v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x6c,0x22,0x0e]
+// CHECK: smin v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x6c,0x22,0x4e]
+// CHECK: smin v0.4h, v1.4h, v2.4h        // encoding: [0x20,0x6c,0x62,0x0e]
+// CHECK: smin v0.8h, v1.8h, v2.8h        // encoding: [0x20,0x6c,0x62,0x4e]
+// CHECK: smin v0.2s, v1.2s, v2.2s        // encoding: [0x20,0x6c,0xa2,0x0e]
+// CHECK: smin v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x6c,0xa2,0x4e]
+
+         umin v0.8b, v1.8b, v2.8b
+         umin v0.16b, v1.16b, v2.16b
+         umin v0.4h, v1.4h, v2.4h
+         umin v0.8h, v1.8h, v2.8h
+         umin v0.2s, v1.2s, v2.2s
+         umin v0.4s, v1.4s, v2.4s
+
+// CHECK: umin v0.8b, v1.8b, v2.8b         // encoding: [0x20,0x6c,0x22,0x2e]
+// CHECK: umin v0.16b, v1.16b, v2.16b      // encoding: [0x20,0x6c,0x22,0x6e]
+// CHECK: umin v0.4h, v1.4h, v2.4h         // encoding: [0x20,0x6c,0x62,0x2e]
+// CHECK: umin v0.8h, v1.8h, v2.8h         // encoding: [0x20,0x6c,0x62,0x6e]
+// CHECK: umin v0.2s, v1.2s, v2.2s         // encoding: [0x20,0x6c,0xa2,0x2e]
+// CHECK: umin v0.4s, v1.4s, v2.4s         // encoding: [0x20,0x6c,0xa2,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Maximum (Floating Point)
+//----------------------------------------------------------------------
+         fmax v0.2s, v1.2s, v2.2s
+         fmax v31.4s, v15.4s, v16.4s
+         fmax v7.2d, v8.2d, v25.2d
+
+// CHECK: fmax v0.2s, v1.2s, v2.2s    // encoding: [0x20,0xf4,0x22,0x0e]
+// CHECK: fmax v31.4s, v15.4s, v16.4s // encoding: [0xff,0xf5,0x30,0x4e]
+// CHECK: fmax v7.2d, v8.2d, v25.2d   // encoding: [0x07,0xf5,0x79,0x4e]
+
+//----------------------------------------------------------------------
+// Vector Minimum (Floating Point)
+//----------------------------------------------------------------------
+         fmin v10.2s, v15.2s, v22.2s
+         fmin v3.4s, v5.4s, v6.4s
+         fmin v17.2d, v13.2d, v2.2d
+
+// CHECK: fmin v10.2s, v15.2s, v22.2s  // encoding: [0xea,0xf5,0xb6,0x0e]
+// CHECK: fmin v3.4s, v5.4s, v6.4s     // encoding: [0xa3,0xf4,0xa6,0x4e]
+// CHECK: fmin v17.2d, v13.2d, v2.2d   // encoding: [0xb1,0xf5,0xe2,0x4e]
+
+//----------------------------------------------------------------------
+// Vector maxNum (Floating Point)
+//----------------------------------------------------------------------
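+// fmaxnm/fminnm implement IEEE 754-2008 maxNum/minNum: when exactly one
+// operand is a quiet NaN, the numeric operand is returned.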
+         fmaxnm v0.2s, v1.2s, v2.2s
+         fmaxnm v31.4s, v15.4s, v16.4s
+         fmaxnm v7.2d, v8.2d, v25.2d
+
+// CHECK: fmaxnm v0.2s, v1.2s, v2.2s    // encoding: [0x20,0xc4,0x22,0x0e]
+// CHECK: fmaxnm v31.4s, v15.4s, v16.4s // encoding: [0xff,0xc5,0x30,0x4e]
+// CHECK: fmaxnm v7.2d, v8.2d, v25.2d   // encoding: [0x07,0xc5,0x79,0x4e]
+
+//----------------------------------------------------------------------
+// Vector minNum (Floating Point)
+//----------------------------------------------------------------------
+         fminnm v10.2s, v15.2s, v22.2s
+         fminnm v3.4s, v5.4s, v6.4s
+         fminnm v17.2d, v13.2d, v2.2d
+
+// CHECK: fminnm v10.2s, v15.2s, v22.2s  // encoding: [0xea,0xc5,0xb6,0x0e]
+// CHECK: fminnm v3.4s, v5.4s, v6.4s     // encoding: [0xa3,0xc4,0xa6,0x4e]
+// CHECK: fminnm v17.2d, v13.2d, v2.2d   // encoding: [0xb1,0xc5,0xe2,0x4e]
+
diff --git a/llvm/test/MC/AArch64/neon-mla-mls-instructions.s b/llvm/test/MC/AArch64/neon-mla-mls-instructions.s
new file mode 100644
index 0000000..3072e6f
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-mla-mls-instructions.s
@@ -0,0 +1,61 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//----------------------------------------------------------------------
+// Vector Integer Multiply-accumulate
+//----------------------------------------------------------------------
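+// mla uses the destination as an accumulator: Vd = Vd + (Vn * Vm) per
+// element; mls subtracts the product instead.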
+         mla v0.8b, v1.8b, v2.8b
+         mla v0.16b, v1.16b, v2.16b
+         mla v0.4h, v1.4h, v2.4h
+         mla v0.8h, v1.8h, v2.8h
+         mla v0.2s, v1.2s, v2.2s
+         mla v0.4s, v1.4s, v2.4s
+
+// CHECK: mla v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x94,0x22,0x0e]
+// CHECK: mla v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x94,0x22,0x4e]
+// CHECK: mla v0.4h, v1.4h, v2.4h        // encoding: [0x20,0x94,0x62,0x0e]
+// CHECK: mla v0.8h, v1.8h, v2.8h        // encoding: [0x20,0x94,0x62,0x4e]
+// CHECK: mla v0.2s, v1.2s, v2.2s        // encoding: [0x20,0x94,0xa2,0x0e]
+// CHECK: mla v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x94,0xa2,0x4e]
+
+
+//----------------------------------------------------------------------
+// Vector Integer Multiply-subtract
+//----------------------------------------------------------------------
+         mls v0.8b, v1.8b, v2.8b
+         mls v0.16b, v1.16b, v2.16b
+         mls v0.4h, v1.4h, v2.4h
+         mls v0.8h, v1.8h, v2.8h
+         mls v0.2s, v1.2s, v2.2s
+         mls v0.4s, v1.4s, v2.4s
+
+// CHECK: mls v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x94,0x22,0x2e]
+// CHECK: mls v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x94,0x22,0x6e]
+// CHECK: mls v0.4h, v1.4h, v2.4h        // encoding: [0x20,0x94,0x62,0x2e]
+// CHECK: mls v0.8h, v1.8h, v2.8h        // encoding: [0x20,0x94,0x62,0x6e]
+// CHECK: mls v0.2s, v1.2s, v2.2s        // encoding: [0x20,0x94,0xa2,0x2e]
+// CHECK: mls v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x94,0xa2,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Floating-Point Multiply-accumulate
+//----------------------------------------------------------------------
+         fmla v0.2s, v1.2s, v2.2s
+         fmla v0.4s, v1.4s, v2.4s
+         fmla v0.2d, v1.2d, v2.2d
+
+// CHECK: fmla v0.2s, v1.2s, v2.2s       // encoding: [0x20,0xcc,0x22,0x0e]
+// CHECK: fmla v0.4s, v1.4s, v2.4s       // encoding: [0x20,0xcc,0x22,0x4e]
+// CHECK: fmla v0.2d, v1.2d, v2.2d       // encoding: [0x20,0xcc,0x62,0x4e]
+
+//----------------------------------------------------------------------
+// Vector Floating-Point Multiply-subtract
+//----------------------------------------------------------------------
+         fmls v0.2s, v1.2s, v2.2s
+         fmls v0.4s, v1.4s, v2.4s
+         fmls v0.2d, v1.2d, v2.2d
+
+// CHECK: fmls v0.2s, v1.2s, v2.2s       // encoding: [0x20,0xcc,0xa2,0x0e]
+// CHECK: fmls v0.4s, v1.4s, v2.4s       // encoding: [0x20,0xcc,0xa2,0x4e]
+// CHECK: fmls v0.2d, v1.2d, v2.2d       // encoding: [0x20,0xcc,0xe2,0x4e]
+
diff --git a/llvm/test/MC/AArch64/neon-mov.s b/llvm/test/MC/AArch64/neon-mov.s
new file mode 100644
index 0000000..8331372
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-mov.s
@@ -0,0 +1,207 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+
+//----------------------------------------------------------------------
+// Vector Move Immediate Shifted
+//----------------------------------------------------------------------
+         movi v0.2s, #1
+         movi v1.2s, #0
+         movi v15.2s, #1, lsl #8
+         movi v16.2s, #1, lsl #16
+         movi v31.2s, #1, lsl #24
+         movi v0.4s, #1
+         movi v0.4s, #1, lsl #8
+         movi v0.4s, #1, lsl #16
+         movi v0.4s, #1, lsl #24
+         movi v0.4h, #1
+         movi v0.4h, #1, lsl #8
+         movi v0.8h, #1
+         movi v0.8h, #1, lsl #8
+
+// CHECK:  movi v0.2s, #0x1           // encoding: [0x20,0x04,0x00,0x0f]
+// CHECK:  movi v1.2s, #0x0           // encoding: [0x01,0x04,0x00,0x0f]
+// CHECK:  movi v15.2s, #0x1, lsl #8  // encoding: [0x2f,0x24,0x00,0x0f]
+// CHECK:  movi v16.2s, #0x1, lsl #16 // encoding: [0x30,0x44,0x00,0x0f]
+// CHECK:  movi v31.2s, #0x1, lsl #24 // encoding: [0x3f,0x64,0x00,0x0f]
+// CHECK:  movi v0.4s, #0x1           // encoding: [0x20,0x04,0x00,0x4f]
+// CHECK:  movi v0.4s, #0x1, lsl #8   // encoding: [0x20,0x24,0x00,0x4f]
+// CHECK:  movi v0.4s, #0x1, lsl #16  // encoding: [0x20,0x44,0x00,0x4f]
+// CHECK:  movi v0.4s, #0x1, lsl #24  // encoding: [0x20,0x64,0x00,0x4f]
+// CHECK:  movi v0.4h, #0x1           // encoding: [0x20,0x84,0x00,0x0f]
+// CHECK:  movi v0.4h, #0x1, lsl #8   // encoding: [0x20,0xa4,0x00,0x0f]
+// CHECK:  movi v0.8h, #0x1           // encoding: [0x20,0x84,0x00,0x4f]
+// CHECK:  movi v0.8h, #0x1, lsl #8   // encoding: [0x20,0xa4,0x00,0x4f]
+
+//----------------------------------------------------------------------
+// Vector Move Inverted Immediate Shifted
+//----------------------------------------------------------------------
+         mvni v0.2s, #1
+         mvni v1.2s, #0
+         mvni v0.2s, #1, lsl #8
+         mvni v0.2s, #1, lsl #16
+         mvni v0.2s, #1, lsl #24
+         mvni v0.4s, #1
+         mvni v15.4s, #1, lsl #8
+         mvni v16.4s, #1, lsl #16
+         mvni v31.4s, #1, lsl #24
+         mvni v0.4h, #1
+         mvni v0.4h, #1, lsl #8
+         mvni v0.8h, #1
+         mvni v0.8h, #1, lsl #8
+
+// CHECK:  mvni v0.2s, #0x1           // encoding: [0x20,0x04,0x00,0x2f]
+// CHECK:  mvni v1.2s, #0x0           // encoding: [0x01,0x04,0x00,0x2f]
+// CHECK:  mvni v0.2s, #0x1, lsl #8   // encoding: [0x20,0x24,0x00,0x2f]
+// CHECK:  mvni v0.2s, #0x1, lsl #16  // encoding: [0x20,0x44,0x00,0x2f]
+// CHECK:  mvni v0.2s, #0x1, lsl #24  // encoding: [0x20,0x64,0x00,0x2f]
+// CHECK:  mvni v0.4s, #0x1           // encoding: [0x20,0x04,0x00,0x6f]
+// CHECK:  mvni v15.4s, #0x1, lsl #8  // encoding: [0x2f,0x24,0x00,0x6f]
+// CHECK:  mvni v16.4s, #0x1, lsl #16 // encoding: [0x30,0x44,0x00,0x6f]
+// CHECK:  mvni v31.4s, #0x1, lsl #24 // encoding: [0x3f,0x64,0x00,0x6f]
+// CHECK:  mvni v0.4h, #0x1           // encoding: [0x20,0x84,0x00,0x2f]
+// CHECK:  mvni v0.4h, #0x1, lsl #8   // encoding: [0x20,0xa4,0x00,0x2f]
+// CHECK:  mvni v0.8h, #0x1           // encoding: [0x20,0x84,0x00,0x6f]
+// CHECK:  mvni v0.8h, #0x1, lsl #8   // encoding: [0x20,0xa4,0x00,0x6f]
+
+//----------------------------------------------------------------------
+// Vector Bitwise Bit Clear (AND NOT) - immediate
+//----------------------------------------------------------------------
+         bic v0.2s, #1
+         bic v1.2s, #0
+         bic v0.2s, #1, lsl #8
+         bic v0.2s, #1, lsl #16
+         bic v0.2s, #1, lsl #24
+         bic v0.4s, #1
+         bic v0.4s, #1, lsl #8
+         bic v0.4s, #1, lsl #16
+         bic v0.4s, #1, lsl #24
+         bic v15.4h, #1
+         bic v16.4h, #1, lsl #8
+         bic v0.8h, #1
+         bic v31.8h, #1, lsl #8
+
+// CHECK:  bic v0.2s, #0x1           // encoding: [0x20,0x14,0x00,0x2f]
+// CHECK:  bic v1.2s, #0x0           // encoding: [0x01,0x14,0x00,0x2f]
+// CHECK:  bic v0.2s, #0x1, lsl #8   // encoding: [0x20,0x34,0x00,0x2f]
+// CHECK:  bic v0.2s, #0x1, lsl #16  // encoding: [0x20,0x54,0x00,0x2f]
+// CHECK:  bic v0.2s, #0x1, lsl #24  // encoding: [0x20,0x74,0x00,0x2f]
+// CHECK:  bic v0.4s, #0x1           // encoding: [0x20,0x14,0x00,0x6f]
+// CHECK:  bic v0.4s, #0x1, lsl #8   // encoding: [0x20,0x34,0x00,0x6f]
+// CHECK:  bic v0.4s, #0x1, lsl #16  // encoding: [0x20,0x54,0x00,0x6f]
+// CHECK:  bic v0.4s, #0x1, lsl #24  // encoding: [0x20,0x74,0x00,0x6f]
+// CHECK:  bic v15.4h, #0x1          // encoding: [0x2f,0x94,0x00,0x2f]
+// CHECK:  bic v16.4h, #0x1, lsl #8  // encoding: [0x30,0xb4,0x00,0x2f]
+// CHECK:  bic v0.8h, #0x1           // encoding: [0x20,0x94,0x00,0x6f]
+// CHECK:  bic v31.8h, #0x1, lsl #8  // encoding: [0x3f,0xb4,0x00,0x6f]
+
+//----------------------------------------------------------------------
+// Vector Bitwise OR - immediate
+//----------------------------------------------------------------------
+         orr v0.2s, #1
+         orr v1.2s, #0
+         orr v0.2s, #1, lsl #8
+         orr v0.2s, #1, lsl #16
+         orr v0.2s, #1, lsl #24
+         orr v0.4s, #1
+         orr v0.4s, #1, lsl #8
+         orr v0.4s, #1, lsl #16
+         orr v0.4s, #1, lsl #24
+         orr v31.4h, #1
+         orr v15.4h, #1, lsl #8
+         orr v0.8h, #1
+         orr v16.8h, #1, lsl #8
+
+// CHECK:  orr v0.2s, #0x1           // encoding: [0x20,0x14,0x00,0x0f]
+// CHECK:  orr v1.2s, #0x0           // encoding: [0x01,0x14,0x00,0x0f]
+// CHECK:  orr v0.2s, #0x1, lsl #8   // encoding: [0x20,0x34,0x00,0x0f]
+// CHECK:  orr v0.2s, #0x1, lsl #16  // encoding: [0x20,0x54,0x00,0x0f]
+// CHECK:  orr v0.2s, #0x1, lsl #24  // encoding: [0x20,0x74,0x00,0x0f]
+// CHECK:  orr v0.4s, #0x1           // encoding: [0x20,0x14,0x00,0x4f]
+// CHECK:  orr v0.4s, #0x1, lsl #8   // encoding: [0x20,0x34,0x00,0x4f]
+// CHECK:  orr v0.4s, #0x1, lsl #16  // encoding: [0x20,0x54,0x00,0x4f]
+// CHECK:  orr v0.4s, #0x1, lsl #24  // encoding: [0x20,0x74,0x00,0x4f]
+// CHECK:  orr v31.4h, #0x1          // encoding: [0x3f,0x94,0x00,0x0f]
+// CHECK:  orr v15.4h, #0x1, lsl #8  // encoding: [0x2f,0xb4,0x00,0x0f]
+// CHECK:  orr v0.8h, #0x1           // encoding: [0x20,0x94,0x00,0x4f]
+// CHECK:  orr v16.8h, #0x1, lsl #8  // encoding: [0x30,0xb4,0x00,0x4f]
+
+//----------------------------------------------------------------------
+// Vector Move Immediate Masked
+//----------------------------------------------------------------------
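+// msl shifts ones, not zeroes, into the vacated low bits ("shifting ones"),
+// so #0x1, msl #8 yields 0x000001ff in each element.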
+         movi v0.2s, #1, msl #8
+         movi v1.2s, #1, msl #16
+         movi v0.4s, #1, msl #8
+         movi v31.4s, #1, msl #16
+
+// CHECK:  movi v0.2s, #0x1, msl #8   // encoding: [0x20,0xc4,0x00,0x0f]
+// CHECK:  movi v1.2s, #0x1, msl #16  // encoding: [0x21,0xd4,0x00,0x0f]
+// CHECK:  movi v0.4s, #0x1, msl #8   // encoding: [0x20,0xc4,0x00,0x4f]
+// CHECK:  movi v31.4s, #0x1, msl #16 // encoding: [0x3f,0xd4,0x00,0x4f]
+
+//----------------------------------------------------------------------
+// Vector Move Inverted Immediate Masked
+//----------------------------------------------------------------------
+         mvni v1.2s, #0x1, msl #8
+         mvni v0.2s, #0x1, msl #16
+         mvni v31.4s, #0x1, msl #8
+         mvni v0.4s, #0x1, msl #16
+
+// CHECK:   mvni v1.2s, #0x1, msl #8  // encoding: [0x21,0xc4,0x00,0x2f]
+// CHECK:   mvni v0.2s, #0x1, msl #16 // encoding: [0x20,0xd4,0x00,0x2f]
+// CHECK:   mvni v31.4s, #0x1, msl #8 // encoding: [0x3f,0xc4,0x00,0x6f]
+// CHECK:   mvni v0.4s, #0x1, msl #16 // encoding: [0x20,0xd4,0x00,0x6f]
+
+//----------------------------------------------------------------------
+// Vector Immediate - per byte
+//----------------------------------------------------------------------
+         movi v0.8b, #0
+         movi v31.8b, #0xff
+         movi v15.16b, #0xf
+         movi v31.16b, #0x1f
+
+// CHECK:   movi v0.8b, #0x0        // encoding: [0x00,0xe4,0x00,0x0f]
+// CHECK:   movi v31.8b, #0xff      // encoding: [0xff,0xe7,0x07,0x0f]
+// CHECK:   movi v15.16b, #0xf      // encoding: [0xef,0xe5,0x00,0x4f]
+// CHECK:   movi v31.16b, #0x1f     // encoding: [0xff,0xe7,0x00,0x4f]
+
+//----------------------------------------------------------------------
+// Vector Move Immediate - bytemask, per doubleword
+//----------------------------------------------------------------------
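+// Each bit of the 8-bit immediate expands to one byte of the pattern (0xff
+// for 1, 0x00 for 0), so only such bytemask constants are encodable.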
+         movi v0.2d, #0xff00ff00ff00ff00
+
+// CHECK: movi v0.2d, #0xff00ff00ff00ff00 // encoding: [0x40,0xe5,0x05,0x6f]
+
+//----------------------------------------------------------------------
+// Vector Move Immediate - bytemask, one doubleword
+//----------------------------------------------------------------------
+         movi d0, #0xff00ff00ff00ff00
+
+// CHECK: movi d0,  #0xff00ff00ff00ff00 // encoding: [0x40,0xe5,0x05,0x2f]
+
+//----------------------------------------------------------------------
+// Vector Floating Point Move Immediate
+//----------------------------------------------------------------------
+         fmov v1.2s, #1.0
+         fmov v15.4s, #1.0
+         fmov v31.2d, #1.0
+
+// CHECK:  fmov v1.2s, #1.00000000     // encoding: [0x01,0xf6,0x03,0x0f]
+// CHECK:  fmov v15.4s, #1.00000000    // encoding: [0x0f,0xf6,0x03,0x4f]
+// CHECK:  fmov v31.2d, #1.00000000    // encoding: [0x1f,0xf6,0x03,0x6f]
+
+
+//----------------------------------------------------------------------
+// Vector Move - register
+//----------------------------------------------------------------------
+      mov v0.8b, v31.8b
+      mov v15.16b, v16.16b
+      orr v0.8b, v31.8b, v31.8b
+      orr v15.16b, v16.16b, v16.16b
+
+// CHECK:   mov v0.8b, v31.8b     // encoding: [0xe0,0x1f,0xbf,0x0e]
+// CHECK:   mov v15.16b, v16.16b  // encoding: [0x0f,0x1e,0xb0,0x4e]
+// CHECK:   mov v0.8b, v31.8b     // encoding: [0xe0,0x1f,0xbf,0x0e]
+// CHECK:   mov v15.16b, v16.16b  // encoding: [0x0f,0x1e,0xb0,0x4e]
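+
+// Note: mov Vd.T, Vn.T is an alias of orr Vd.T, Vn.T, Vn.T, which is why
+// each mov above assembles to exactly the same encoding as the
+// corresponding orr.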
+
diff --git a/llvm/test/MC/AArch64/neon-mul-div-instructions.s b/llvm/test/MC/AArch64/neon-mul-div-instructions.s
new file mode 100644
index 0000000..1fe6d2b
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-mul-div-instructions.s
@@ -0,0 +1,86 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//----------------------------------------------------------------------
+// Vector Integer Mul
+//----------------------------------------------------------------------
+         mul v0.8b, v1.8b, v2.8b
+         mul v0.16b, v1.16b, v2.16b
+         mul v0.4h, v1.4h, v2.4h
+         mul v0.8h, v1.8h, v2.8h
+         mul v0.2s, v1.2s, v2.2s
+         mul v0.4s, v1.4s, v2.4s
+
+// CHECK: mul v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x9c,0x22,0x0e]
+// CHECK: mul v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x9c,0x22,0x4e]
+// CHECK: mul v0.4h, v1.4h, v2.4h        // encoding: [0x20,0x9c,0x62,0x0e]
+// CHECK: mul v0.8h, v1.8h, v2.8h        // encoding: [0x20,0x9c,0x62,0x4e]
+// CHECK: mul v0.2s, v1.2s, v2.2s        // encoding: [0x20,0x9c,0xa2,0x0e]
+// CHECK: mul v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x9c,0xa2,0x4e]
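+
+// Note: the element size is encoded in bits 23-22, visible above as the
+// third encoding byte stepping 0x22/0x62/0xa2 for the byte/halfword/word
+// arrangements, while bit 30 (the Q bit) selects the 64-bit or 128-bit
+// form (0x0e vs 0x4e in the final byte).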
+
+
+//----------------------------------------------------------------------
+// Vector Floating-Point Mul
+//----------------------------------------------------------------------
+         fmul v0.2s, v1.2s, v2.2s
+         fmul v0.4s, v1.4s, v2.4s
+         fmul v0.2d, v1.2d, v2.2d
+
+// CHECK: fmul v0.2s, v1.2s, v2.2s       // encoding: [0x20,0xdc,0x22,0x2e]
+// CHECK: fmul v0.4s, v1.4s, v2.4s       // encoding: [0x20,0xdc,0x22,0x6e]
+// CHECK: fmul v0.2d, v1.2d, v2.2d       // encoding: [0x20,0xdc,0x62,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Floating-Point Div
+//----------------------------------------------------------------------
+         fdiv v0.2s, v1.2s, v2.2s
+         fdiv v0.4s, v1.4s, v2.4s
+         fdiv v0.2d, v1.2d, v2.2d
+
+// CHECK: fdiv v0.2s, v1.2s, v2.2s       // encoding: [0x20,0xfc,0x22,0x2e]
+// CHECK: fdiv v0.4s, v1.4s, v2.4s       // encoding: [0x20,0xfc,0x22,0x6e]
+// CHECK: fdiv v0.2d, v1.2d, v2.2d       // encoding: [0x20,0xfc,0x62,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Multiply (Polynomial)
+//----------------------------------------------------------------------
+         pmul v17.8b, v31.8b, v16.8b
+         pmul v0.16b, v1.16b, v2.16b
+
+// CHECK: pmul v17.8b, v31.8b, v16.8b     // encoding: [0xf1,0x9f,0x30,0x2e]
+// CHECK: pmul v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x9c,0x22,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Saturating Doubling Multiply High
+//----------------------------------------------------------------------
+         sqdmulh v2.4h, v25.4h, v3.4h
+         sqdmulh v12.8h, v5.8h, v13.8h
+         sqdmulh v3.2s, v1.2s, v30.2s
+
+// CHECK: sqdmulh v2.4h, v25.4h, v3.4h    // encoding: [0x22,0xb7,0x63,0x0e]
+// CHECK: sqdmulh v12.8h, v5.8h, v13.8h   // encoding: [0xac,0xb4,0x6d,0x4e]
+// CHECK: sqdmulh v3.2s, v1.2s, v30.2s    // encoding: [0x23,0xb4,0xbe,0x0e]
+
+//----------------------------------------------------------------------
+// Vector Saturating Rounding Doubling Multiply High
+//----------------------------------------------------------------------
+         sqrdmulh v2.4h, v25.4h, v3.4h
+         sqrdmulh v12.8h, v5.8h, v13.8h
+         sqrdmulh v3.2s, v1.2s, v30.2s
+
+// CHECK: sqrdmulh v2.4h, v25.4h, v3.4h    // encoding: [0x22,0xb7,0x63,0x2e]
+// CHECK: sqrdmulh v12.8h, v5.8h, v13.8h   // encoding: [0xac,0xb4,0x6d,0x6e]
+// CHECK: sqrdmulh v3.2s, v1.2s, v30.2s    // encoding: [0x23,0xb4,0xbe,0x2e]
+
+//----------------------------------------------------------------------
+// Vector Multiply Extended
+//----------------------------------------------------------------------
+      fmulx v21.2s, v5.2s, v13.2s
+      fmulx v1.4s, v25.4s, v3.4s
+      fmulx v31.2d, v22.2d, v2.2d
+
+// CHECK: fmulx v21.2s, v5.2s, v13.2s // encoding: [0xb5,0xdc,0x2d,0x0e]
+// CHECK: fmulx v1.4s, v25.4s, v3.4s // encoding: [0x21,0xdf,0x23,0x4e]
+// CHECK: fmulx v31.2d, v22.2d, v2.2d // encoding: [0xdf,0xde,0x62,0x4e]
+
diff --git a/llvm/test/MC/AArch64/neon-rounding-halving-add.s b/llvm/test/MC/AArch64/neon-rounding-halving-add.s
new file mode 100644
index 0000000..47ac212
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-rounding-halving-add.s
@@ -0,0 +1,39 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+
+//------------------------------------------------------------------------------
+// Vector Integer Rounding Halving Add (Signed)
+//------------------------------------------------------------------------------
+         srhadd v0.8b, v1.8b, v2.8b
+         srhadd v0.16b, v1.16b, v2.16b
+         srhadd v0.4h, v1.4h, v2.4h
+         srhadd v0.8h, v1.8h, v2.8h
+         srhadd v0.2s, v1.2s, v2.2s
+         srhadd v0.4s, v1.4s, v2.4s
+
+// CHECK: srhadd v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x14,0x22,0x0e]
+// CHECK: srhadd v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x14,0x22,0x4e]
+// CHECK: srhadd v0.4h, v1.4h, v2.4h        // encoding: [0x20,0x14,0x62,0x0e]
+// CHECK: srhadd v0.8h, v1.8h, v2.8h        // encoding: [0x20,0x14,0x62,0x4e]
+// CHECK: srhadd v0.2s, v1.2s, v2.2s        // encoding: [0x20,0x14,0xa2,0x0e]
+// CHECK: srhadd v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x14,0xa2,0x4e]
+
+//------------------------------------------------------------------------------
+// Vector Integer Rounding Halving Add (Unsigned)
+//------------------------------------------------------------------------------
+         urhadd v0.8b, v1.8b, v2.8b
+         urhadd v0.16b, v1.16b, v2.16b
+         urhadd v0.4h, v1.4h, v2.4h
+         urhadd v0.8h, v1.8h, v2.8h
+         urhadd v0.2s, v1.2s, v2.2s
+         urhadd v0.4s, v1.4s, v2.4s
+
+// CHECK: urhadd v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x14,0x22,0x2e]
+// CHECK: urhadd v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x14,0x22,0x6e]
+// CHECK: urhadd v0.4h, v1.4h, v2.4h        // encoding: [0x20,0x14,0x62,0x2e]
+// CHECK: urhadd v0.8h, v1.8h, v2.8h        // encoding: [0x20,0x14,0x62,0x6e]
+// CHECK: urhadd v0.2s, v1.2s, v2.2s        // encoding: [0x20,0x14,0xa2,0x2e]
+// CHECK: urhadd v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x14,0xa2,0x6e]
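+
+// Note: the unsigned forms differ from the signed ones only in the U bit
+// (bit 29), which shows up as 0x0e -> 0x2e and 0x4e -> 0x6e in the final
+// encoding byte.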
+
diff --git a/llvm/test/MC/AArch64/neon-rounding-shift.s b/llvm/test/MC/AArch64/neon-rounding-shift.s
new file mode 100644
index 0000000..f3c70d7
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-rounding-shift.s
@@ -0,0 +1,57 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+
+//------------------------------------------------------------------------------
+// Vector Integer Rounding Shift Left (Signed)
+//------------------------------------------------------------------------------
+         srshl v0.8b, v1.8b, v2.8b
+         srshl v0.16b, v1.16b, v2.16b
+         srshl v0.4h, v1.4h, v2.4h
+         srshl v0.8h, v1.8h, v2.8h
+         srshl v0.2s, v1.2s, v2.2s
+         srshl v0.4s, v1.4s, v2.4s
+         srshl v0.2d, v1.2d, v2.2d
+
+// CHECK: srshl v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x54,0x22,0x0e]
+// CHECK: srshl v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x54,0x22,0x4e]
+// CHECK: srshl v0.4h, v1.4h, v2.4h        // encoding: [0x20,0x54,0x62,0x0e]
+// CHECK: srshl v0.8h, v1.8h, v2.8h        // encoding: [0x20,0x54,0x62,0x4e]
+// CHECK: srshl v0.2s, v1.2s, v2.2s        // encoding: [0x20,0x54,0xa2,0x0e]
+// CHECK: srshl v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x54,0xa2,0x4e]
+// CHECK: srshl v0.2d, v1.2d, v2.2d        // encoding: [0x20,0x54,0xe2,0x4e]
+
+//------------------------------------------------------------------------------
+// Vector Integer Rounding Shift Left (Unsigned)
+//------------------------------------------------------------------------------
+         urshl v0.8b, v1.8b, v2.8b
+         urshl v0.16b, v1.16b, v2.16b
+         urshl v0.4h, v1.4h, v2.4h
+         urshl v0.8h, v1.8h, v2.8h
+         urshl v0.2s, v1.2s, v2.2s
+         urshl v0.4s, v1.4s, v2.4s
+         urshl v0.2d, v1.2d, v2.2d
+
+// CHECK: urshl v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x54,0x22,0x2e]
+// CHECK: urshl v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x54,0x22,0x6e]
+// CHECK: urshl v0.4h, v1.4h, v2.4h        // encoding: [0x20,0x54,0x62,0x2e]
+// CHECK: urshl v0.8h, v1.8h, v2.8h        // encoding: [0x20,0x54,0x62,0x6e]
+// CHECK: urshl v0.2s, v1.2s, v2.2s        // encoding: [0x20,0x54,0xa2,0x2e]
+// CHECK: urshl v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x54,0xa2,0x6e]
+// CHECK: urshl v0.2d, v1.2d, v2.2d        // encoding: [0x20,0x54,0xe2,0x6e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Rounding Shift Left (Signed)
+//------------------------------------------------------------------------------
+         srshl d17, d31, d8
+
+// CHECK: srshl d17, d31, d8      // encoding: [0xf1,0x57,0xe8,0x5e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Rounding Shift Left (Unsigned)
+//------------------------------------------------------------------------------
+         urshl d17, d31, d8
+
+// CHECK: urshl d17, d31, d8      // encoding: [0xf1,0x57,0xe8,0x7e]
+
diff --git a/llvm/test/MC/AArch64/neon-saturating-add-sub.s b/llvm/test/MC/AArch64/neon-saturating-add-sub.s
new file mode 100644
index 0000000..1032ae4
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-saturating-add-sub.s
@@ -0,0 +1,133 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+
+//------------------------------------------------------------------------------
+// Vector Integer Saturating Add (Signed)
+//------------------------------------------------------------------------------
+         sqadd v0.8b, v1.8b, v2.8b
+         sqadd v0.16b, v1.16b, v2.16b
+         sqadd v0.4h, v1.4h, v2.4h
+         sqadd v0.8h, v1.8h, v2.8h
+         sqadd v0.2s, v1.2s, v2.2s
+         sqadd v0.4s, v1.4s, v2.4s
+         sqadd v0.2d, v1.2d, v2.2d
+
+// CHECK: sqadd v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x0c,0x22,0x0e]
+// CHECK: sqadd v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x0c,0x22,0x4e]
+// CHECK: sqadd v0.4h, v1.4h, v2.4h        // encoding: [0x20,0x0c,0x62,0x0e]
+// CHECK: sqadd v0.8h, v1.8h, v2.8h        // encoding: [0x20,0x0c,0x62,0x4e]
+// CHECK: sqadd v0.2s, v1.2s, v2.2s        // encoding: [0x20,0x0c,0xa2,0x0e]
+// CHECK: sqadd v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x0c,0xa2,0x4e]
+// CHECK: sqadd v0.2d, v1.2d, v2.2d        // encoding: [0x20,0x0c,0xe2,0x4e]
+
+//------------------------------------------------------------------------------
+// Vector Integer Saturating Add (Unsigned)
+//------------------------------------------------------------------------------
+         uqadd v0.8b, v1.8b, v2.8b
+         uqadd v0.16b, v1.16b, v2.16b
+         uqadd v0.4h, v1.4h, v2.4h
+         uqadd v0.8h, v1.8h, v2.8h
+         uqadd v0.2s, v1.2s, v2.2s
+         uqadd v0.4s, v1.4s, v2.4s
+         uqadd v0.2d, v1.2d, v2.2d
+
+// CHECK: uqadd v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x0c,0x22,0x2e]
+// CHECK: uqadd v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x0c,0x22,0x6e]
+// CHECK: uqadd v0.4h, v1.4h, v2.4h        // encoding: [0x20,0x0c,0x62,0x2e]
+// CHECK: uqadd v0.8h, v1.8h, v2.8h        // encoding: [0x20,0x0c,0x62,0x6e]
+// CHECK: uqadd v0.2s, v1.2s, v2.2s        // encoding: [0x20,0x0c,0xa2,0x2e]
+// CHECK: uqadd v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x0c,0xa2,0x6e]
+// CHECK: uqadd v0.2d, v1.2d, v2.2d        // encoding: [0x20,0x0c,0xe2,0x6e]
+
+//------------------------------------------------------------------------------
+// Vector Integer Saturating Sub (Signed)
+//------------------------------------------------------------------------------
+         sqsub v0.8b, v1.8b, v2.8b
+         sqsub v0.16b, v1.16b, v2.16b
+         sqsub v0.4h, v1.4h, v2.4h
+         sqsub v0.8h, v1.8h, v2.8h
+         sqsub v0.2s, v1.2s, v2.2s
+         sqsub v0.4s, v1.4s, v2.4s
+         sqsub v0.2d, v1.2d, v2.2d
+
+// CHECK: sqsub v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x2c,0x22,0x0e]
+// CHECK: sqsub v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x2c,0x22,0x4e]
+// CHECK: sqsub v0.4h, v1.4h, v2.4h        // encoding: [0x20,0x2c,0x62,0x0e]
+// CHECK: sqsub v0.8h, v1.8h, v2.8h        // encoding: [0x20,0x2c,0x62,0x4e]
+// CHECK: sqsub v0.2s, v1.2s, v2.2s        // encoding: [0x20,0x2c,0xa2,0x0e]
+// CHECK: sqsub v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x2c,0xa2,0x4e]
+// CHECK: sqsub v0.2d, v1.2d, v2.2d        // encoding: [0x20,0x2c,0xe2,0x4e]
+
+//------------------------------------------------------------------------------
+// Vector Integer Saturating Sub (Unsigned)
+//------------------------------------------------------------------------------
+         uqsub v0.8b, v1.8b, v2.8b
+         uqsub v0.16b, v1.16b, v2.16b
+         uqsub v0.4h, v1.4h, v2.4h
+         uqsub v0.8h, v1.8h, v2.8h
+         uqsub v0.2s, v1.2s, v2.2s
+         uqsub v0.4s, v1.4s, v2.4s
+         uqsub v0.2d, v1.2d, v2.2d
+
+// CHECK: uqsub v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x2c,0x22,0x2e]
+// CHECK: uqsub v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x2c,0x22,0x6e]
+// CHECK: uqsub v0.4h, v1.4h, v2.4h        // encoding: [0x20,0x2c,0x62,0x2e]
+// CHECK: uqsub v0.8h, v1.8h, v2.8h        // encoding: [0x20,0x2c,0x62,0x6e]
+// CHECK: uqsub v0.2s, v1.2s, v2.2s        // encoding: [0x20,0x2c,0xa2,0x2e]
+// CHECK: uqsub v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x2c,0xa2,0x6e]
+// CHECK: uqsub v0.2d, v1.2d, v2.2d        // encoding: [0x20,0x2c,0xe2,0x6e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Add (Signed)
+//------------------------------------------------------------------------------
+         sqadd b0, b1, b2
+         sqadd h10, h11, h12
+         sqadd s20, s21, s2
+         sqadd d17, d31, d8
+
+// CHECK: sqadd b0, b1, b2        // encoding: [0x20,0x0c,0x22,0x5e]
+// CHECK: sqadd h10, h11, h12     // encoding: [0x6a,0x0d,0x6c,0x5e]
+// CHECK: sqadd s20, s21, s2      // encoding: [0xb4,0x0e,0xa2,0x5e]
+// CHECK: sqadd d17, d31, d8      // encoding: [0xf1,0x0f,0xe8,0x5e]
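+
+// Note: the scalar forms in these sections use the AdvSIMD scalar
+// three-same encoding, so the final byte is 0x5e (signed) or 0x7e
+// (unsigned) rather than the vector forms' 0x0e/0x2e/0x4e/0x6e; the
+// element size still lives in bits 23-22.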
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Add (Unsigned)
+//------------------------------------------------------------------------------
+         uqadd b0, b1, b2
+         uqadd h10, h11, h12
+         uqadd s20, s21, s2
+         uqadd d17, d31, d8
+
+// CHECK: uqadd b0, b1, b2        // encoding: [0x20,0x0c,0x22,0x7e]
+// CHECK: uqadd h10, h11, h12     // encoding: [0x6a,0x0d,0x6c,0x7e]
+// CHECK: uqadd s20, s21, s2      // encoding: [0xb4,0x0e,0xa2,0x7e]
+// CHECK: uqadd d17, d31, d8      // encoding: [0xf1,0x0f,0xe8,0x7e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Sub (Signed)
+//------------------------------------------------------------------------------
+         sqsub b0, b1, b2
+         sqsub h10, h11, h12
+         sqsub s20, s21, s2
+         sqsub d17, d31, d8
+
+// CHECK: sqsub b0, b1, b2        // encoding: [0x20,0x2c,0x22,0x5e]
+// CHECK: sqsub h10, h11, h12     // encoding: [0x6a,0x2d,0x6c,0x5e]
+// CHECK: sqsub s20, s21, s2      // encoding: [0xb4,0x2e,0xa2,0x5e]
+// CHECK: sqsub d17, d31, d8      // encoding: [0xf1,0x2f,0xe8,0x5e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Sub (Unsigned)
+//------------------------------------------------------------------------------
+         uqsub b0, b1, b2
+         uqsub h10, h11, h12
+         uqsub s20, s21, s2
+         uqsub d17, d31, d8
+
+// CHECK: uqsub b0, b1, b2        // encoding: [0x20,0x2c,0x22,0x7e]
+// CHECK: uqsub h10, h11, h12     // encoding: [0x6a,0x2d,0x6c,0x7e]
+// CHECK: uqsub s20, s21, s2      // encoding: [0xb4,0x2e,0xa2,0x7e]
+// CHECK: uqsub d17, d31, d8      // encoding: [0xf1,0x2f,0xe8,0x7e]
+
diff --git a/llvm/test/MC/AArch64/neon-saturating-rounding-shift.s b/llvm/test/MC/AArch64/neon-saturating-rounding-shift.s
new file mode 100644
index 0000000..a36e689
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-saturating-rounding-shift.s
@@ -0,0 +1,70 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+
+//------------------------------------------------------------------------------
+// Vector Integer Saturating Rounding Shift Left (Signed)
+//------------------------------------------------------------------------------
+         sqrshl v0.8b, v1.8b, v2.8b
+         sqrshl v0.16b, v1.16b, v2.16b
+         sqrshl v0.4h, v1.4h, v2.4h
+         sqrshl v0.8h, v1.8h, v2.8h
+         sqrshl v0.2s, v1.2s, v2.2s
+         sqrshl v0.4s, v1.4s, v2.4s
+         sqrshl v0.2d, v1.2d, v2.2d
+
+// CHECK: sqrshl v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x5c,0x22,0x0e]
+// CHECK: sqrshl v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x5c,0x22,0x4e]
+// CHECK: sqrshl v0.4h, v1.4h, v2.4h        // encoding: [0x20,0x5c,0x62,0x0e]
+// CHECK: sqrshl v0.8h, v1.8h, v2.8h        // encoding: [0x20,0x5c,0x62,0x4e]
+// CHECK: sqrshl v0.2s, v1.2s, v2.2s        // encoding: [0x20,0x5c,0xa2,0x0e]
+// CHECK: sqrshl v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x5c,0xa2,0x4e]
+// CHECK: sqrshl v0.2d, v1.2d, v2.2d        // encoding: [0x20,0x5c,0xe2,0x4e]
+
+//------------------------------------------------------------------------------
+// Vector Integer Saturating Rounding Shift Left (Unsigned)
+//------------------------------------------------------------------------------
+         uqrshl v0.8b, v1.8b, v2.8b
+         uqrshl v0.16b, v1.16b, v2.16b
+         uqrshl v0.4h, v1.4h, v2.4h
+         uqrshl v0.8h, v1.8h, v2.8h
+         uqrshl v0.2s, v1.2s, v2.2s
+         uqrshl v0.4s, v1.4s, v2.4s
+         uqrshl v0.2d, v1.2d, v2.2d
+
+// CHECK: uqrshl v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x5c,0x22,0x2e]
+// CHECK: uqrshl v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x5c,0x22,0x6e]
+// CHECK: uqrshl v0.4h, v1.4h, v2.4h        // encoding: [0x20,0x5c,0x62,0x2e]
+// CHECK: uqrshl v0.8h, v1.8h, v2.8h        // encoding: [0x20,0x5c,0x62,0x6e]
+// CHECK: uqrshl v0.2s, v1.2s, v2.2s        // encoding: [0x20,0x5c,0xa2,0x2e]
+// CHECK: uqrshl v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x5c,0xa2,0x6e]
+// CHECK: uqrshl v0.2d, v1.2d, v2.2d        // encoding: [0x20,0x5c,0xe2,0x6e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Rounding Shift Left (Signed)
+//------------------------------------------------------------------------------
+         sqrshl b0, b1, b2
+         sqrshl h10, h11, h12
+         sqrshl s20, s21, s2
+         sqrshl d17, d31, d8
+
+// CHECK: sqrshl b0, b1, b2        // encoding: [0x20,0x5c,0x22,0x5e]
+// CHECK: sqrshl h10, h11, h12     // encoding: [0x6a,0x5d,0x6c,0x5e]
+// CHECK: sqrshl s20, s21, s2      // encoding: [0xb4,0x5e,0xa2,0x5e]
+// CHECK: sqrshl d17, d31, d8      // encoding: [0xf1,0x5f,0xe8,0x5e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Rounding Shift Left (Unsigned)
+//------------------------------------------------------------------------------
+         uqrshl b0, b1, b2
+         uqrshl h10, h11, h12
+         uqrshl s20, s21, s2
+         uqrshl d17, d31, d8
+
+// CHECK: uqrshl b0, b1, b2        // encoding: [0x20,0x5c,0x22,0x7e]
+// CHECK: uqrshl h10, h11, h12     // encoding: [0x6a,0x5d,0x6c,0x7e]
+// CHECK: uqrshl s20, s21, s2      // encoding: [0xb4,0x5e,0xa2,0x7e]
+// CHECK: uqrshl d17, d31, d8      // encoding: [0xf1,0x5f,0xe8,0x7e]
+
+
diff --git a/llvm/test/MC/AArch64/neon-saturating-shift.s b/llvm/test/MC/AArch64/neon-saturating-shift.s
new file mode 100644
index 0000000..2c8456d
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-saturating-shift.s
@@ -0,0 +1,69 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+
+//------------------------------------------------------------------------------
+// Vector Integer Saturating Shift Left (Signed)
+//------------------------------------------------------------------------------
+         sqshl v0.8b, v1.8b, v2.8b
+         sqshl v0.16b, v1.16b, v2.16b
+         sqshl v0.4h, v1.4h, v2.4h
+         sqshl v0.8h, v1.8h, v2.8h
+         sqshl v0.2s, v1.2s, v2.2s
+         sqshl v0.4s, v1.4s, v2.4s
+         sqshl v0.2d, v1.2d, v2.2d
+
+// CHECK: sqshl v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x4c,0x22,0x0e]
+// CHECK: sqshl v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x4c,0x22,0x4e]
+// CHECK: sqshl v0.4h, v1.4h, v2.4h        // encoding: [0x20,0x4c,0x62,0x0e]
+// CHECK: sqshl v0.8h, v1.8h, v2.8h        // encoding: [0x20,0x4c,0x62,0x4e]
+// CHECK: sqshl v0.2s, v1.2s, v2.2s        // encoding: [0x20,0x4c,0xa2,0x0e]
+// CHECK: sqshl v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x4c,0xa2,0x4e]
+// CHECK: sqshl v0.2d, v1.2d, v2.2d        // encoding: [0x20,0x4c,0xe2,0x4e]
+
+//------------------------------------------------------------------------------
+// Vector Integer Saturating Shift Left (Unsigned)
+//------------------------------------------------------------------------------
+         uqshl v0.8b, v1.8b, v2.8b
+         uqshl v0.16b, v1.16b, v2.16b
+         uqshl v0.4h, v1.4h, v2.4h
+         uqshl v0.8h, v1.8h, v2.8h
+         uqshl v0.2s, v1.2s, v2.2s
+         uqshl v0.4s, v1.4s, v2.4s
+         uqshl v0.2d, v1.2d, v2.2d
+
+// CHECK: uqshl v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x4c,0x22,0x2e]
+// CHECK: uqshl v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x4c,0x22,0x6e]
+// CHECK: uqshl v0.4h, v1.4h, v2.4h        // encoding: [0x20,0x4c,0x62,0x2e]
+// CHECK: uqshl v0.8h, v1.8h, v2.8h        // encoding: [0x20,0x4c,0x62,0x6e]
+// CHECK: uqshl v0.2s, v1.2s, v2.2s        // encoding: [0x20,0x4c,0xa2,0x2e]
+// CHECK: uqshl v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x4c,0xa2,0x6e]
+// CHECK: uqshl v0.2d, v1.2d, v2.2d        // encoding: [0x20,0x4c,0xe2,0x6e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Shift Left (Signed)
+//------------------------------------------------------------------------------
+         sqshl b0, b1, b2
+         sqshl h10, h11, h12
+         sqshl s20, s21, s2
+         sqshl d17, d31, d8
+
+// CHECK: sqshl b0, b1, b2        // encoding: [0x20,0x4c,0x22,0x5e]
+// CHECK: sqshl h10, h11, h12     // encoding: [0x6a,0x4d,0x6c,0x5e]
+// CHECK: sqshl s20, s21, s2      // encoding: [0xb4,0x4e,0xa2,0x5e]
+// CHECK: sqshl d17, d31, d8      // encoding: [0xf1,0x4f,0xe8,0x5e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Shift Left (Unsigned)
+//------------------------------------------------------------------------------
+         uqshl b0, b1, b2
+         uqshl h10, h11, h12
+         uqshl s20, s21, s2
+         uqshl d17, d31, d8
+
+// CHECK: uqshl b0, b1, b2        // encoding: [0x20,0x4c,0x22,0x7e]
+// CHECK: uqshl h10, h11, h12     // encoding: [0x6a,0x4d,0x6c,0x7e]
+// CHECK: uqshl s20, s21, s2      // encoding: [0xb4,0x4e,0xa2,0x7e]
+// CHECK: uqshl d17, d31, d8      // encoding: [0xf1,0x4f,0xe8,0x7e]
+
diff --git a/llvm/test/MC/AArch64/neon-shift.s b/llvm/test/MC/AArch64/neon-shift.s
new file mode 100644
index 0000000..be1799e
--- /dev/null
+++ b/llvm/test/MC/AArch64/neon-shift.s
@@ -0,0 +1,57 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+
+//------------------------------------------------------------------------------
+// Vector Integer Shift Left (Signed)
+//------------------------------------------------------------------------------
+         sshl v0.8b, v1.8b, v2.8b
+         sshl v0.16b, v1.16b, v2.16b
+         sshl v0.4h, v1.4h, v2.4h
+         sshl v0.8h, v1.8h, v2.8h
+         sshl v0.2s, v1.2s, v2.2s
+         sshl v0.4s, v1.4s, v2.4s
+         sshl v0.2d, v1.2d, v2.2d
+
+// CHECK: sshl v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x44,0x22,0x0e]
+// CHECK: sshl v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x44,0x22,0x4e]
+// CHECK: sshl v0.4h, v1.4h, v2.4h        // encoding: [0x20,0x44,0x62,0x0e]
+// CHECK: sshl v0.8h, v1.8h, v2.8h        // encoding: [0x20,0x44,0x62,0x4e]
+// CHECK: sshl v0.2s, v1.2s, v2.2s        // encoding: [0x20,0x44,0xa2,0x0e]
+// CHECK: sshl v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x44,0xa2,0x4e]
+// CHECK: sshl v0.2d, v1.2d, v2.2d        // encoding: [0x20,0x44,0xe2,0x4e]
+
+//------------------------------------------------------------------------------
+// Vector Integer Shift Left (Unsigned)
+//------------------------------------------------------------------------------
+         ushl v0.8b, v1.8b, v2.8b
+         ushl v0.16b, v1.16b, v2.16b
+         ushl v0.4h, v1.4h, v2.4h
+         ushl v0.8h, v1.8h, v2.8h
+         ushl v0.2s, v1.2s, v2.2s
+         ushl v0.4s, v1.4s, v2.4s
+         ushl v0.2d, v1.2d, v2.2d
+
+// CHECK: ushl v0.8b, v1.8b, v2.8b        // encoding: [0x20,0x44,0x22,0x2e]
+// CHECK: ushl v0.16b, v1.16b, v2.16b     // encoding: [0x20,0x44,0x22,0x6e]
+// CHECK: ushl v0.4h, v1.4h, v2.4h        // encoding: [0x20,0x44,0x62,0x2e]
+// CHECK: ushl v0.8h, v1.8h, v2.8h        // encoding: [0x20,0x44,0x62,0x6e]
+// CHECK: ushl v0.2s, v1.2s, v2.2s        // encoding: [0x20,0x44,0xa2,0x2e]
+// CHECK: ushl v0.4s, v1.4s, v2.4s        // encoding: [0x20,0x44,0xa2,0x6e]
+// CHECK: ushl v0.2d, v1.2d, v2.2d        // encoding: [0x20,0x44,0xe2,0x6e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Shift Left (Signed)
+//------------------------------------------------------------------------------
+         sshl d17, d31, d8
+
+// CHECK: sshl d17, d31, d8      // encoding: [0xf1,0x47,0xe8,0x5e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Shift Left (Unsigned)
+//------------------------------------------------------------------------------
+         ushl d17, d31, d8
+
+// CHECK: ushl d17, d31, d8      // encoding: [0xf1,0x47,0xe8,0x7e]
+
diff --git a/llvm/test/MC/AArch64/noneon-diagnostics.s b/llvm/test/MC/AArch64/noneon-diagnostics.s
new file mode 100644
index 0000000..ea786c0
--- /dev/null
+++ b/llvm/test/MC/AArch64/noneon-diagnostics.s
@@ -0,0 +1,28 @@
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=-neon < %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-ERROR < %t %s
+
+        fmla v3.4s, v12.4s, v17.4s
+        fmla v1.2d, v30.2d, v20.2d
+        fmla v9.2s, v9.2s, v0.2s
+// CHECK-ERROR: error: instruction requires a CPU feature not currently enabled
+// CHECK-ERROR-NEXT:    fmla v3.4s, v12.4s, v17.4s
+// CHECK-ERROR-NEXT:    ^
+// CHECK-ERROR-NEXT: error: instruction requires a CPU feature not currently enabled
+// CHECK-ERROR-NEXT:    fmla v1.2d, v30.2d, v20.2d
+// CHECK-ERROR-NEXT:    ^
+// CHECK-ERROR-NEXT: error: instruction requires a CPU feature not currently enabled
+// CHECK-ERROR-NEXT:    fmla v9.2s, v9.2s, v0.2s
+// CHECK-ERROR-NEXT:    ^
+
+        fmls v3.4s, v12.4s, v17.4s
+        fmls v1.2d, v30.2d, v20.2d
+        fmls v9.2s, v9.2s, v0.2s
+// CHECK-ERROR: error: instruction requires a CPU feature not currently enabled
+// CHECK-ERROR-NEXT:    fmls v3.4s, v12.4s, v17.4s
+// CHECK-ERROR-NEXT:    ^
+// CHECK-ERROR-NEXT: error: instruction requires a CPU feature not currently enabled
+// CHECK-ERROR-NEXT:    fmls v1.2d, v30.2d, v20.2d
+// CHECK-ERROR-NEXT:    ^
+// CHECK-ERROR-NEXT: error: instruction requires a CPU feature not currently enabled
+// CHECK-ERROR-NEXT:    fmls v9.2s, v9.2s, v0.2s
+// CHECK-ERROR-NEXT:    ^
diff --git a/llvm/test/MC/Disassembler/AArch64/neon-instructions.txt b/llvm/test/MC/Disassembler/AArch64/neon-instructions.txt
new file mode 100644
index 0000000..40d1f4c
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AArch64/neon-instructions.txt
@@ -0,0 +1,673 @@
+# RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -disassemble < %s | FileCheck %s
+
+#------------------------------------------------------------------------------
+# Vector Integer Add/Sub
+#------------------------------------------------------------------------------
+# CHECK: add v31.8b, v31.8b, v31.8b
+# CHECK: sub v0.2d, v0.2d, v0.2d
+0xff 0x87 0x3f 0x0e
+0x00 0x84 0xe0 0x6e
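+
+# Note: each input line gives one instruction as its four bytes in memory
+# (little-endian) order, so "0xff 0x87 0x3f 0x0e" above is the 32-bit word
+# 0x0e3f87ff, i.e. add v31.8b, v31.8b, v31.8b.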
+
+#------------------------------------------------------------------------------
+# Vector Floating-Point Add/Sub
+#------------------------------------------------------------------------------
+
+# CHECK: fadd v0.4s, v0.4s, v0.4s
+# CHECK: fsub v31.2s, v31.2s, v31.2s
+0x00 0xd4 0x20 0x4e
+0xff 0xd7 0xbf 0x0e
+
+#------------------------------------------------------------------------------
+# Vector Integer Mul
+#------------------------------------------------------------------------------
+# CHECK: mul v0.8b, v1.8b, v2.8b
+0x20 0x9c 0x22 0x0e
+
+#------------------------------------------------------------------------------
+# Vector Floating-Point Mul/Div
+#------------------------------------------------------------------------------
+# CHECK: fmul v0.2s, v1.2s, v2.2s
+# CHECK: fdiv v31.2s, v31.2s, v31.2s
+0x20 0xdc 0x22 0x2e
+0xff 0xff 0x3f 0x2e
+
+#----------------------------------------------------------------------
+# Vector Polynomial Multiply
+#----------------------------------------------------------------------
+# CHECK: pmul v0.8b, v15.8b, v16.8b
+# CHECK: pmul v31.16b, v7.16b, v8.16b
+0xe0 0x9d 0x30 0x2e
+0xff 0x9c 0x28 0x6e
+
+#------------------------------------------------------------------------------
+# Vector And, Orr, Eor, Orn, Bic
+#------------------------------------------------------------------------------
+# CHECK: and v2.8b, v2.8b, v2.8b
+# CHECK: orr v31.16b, v31.16b, v30.16b
+# CHECK: eor v0.16b, v1.16b, v2.16b
+# CHECK: orn v9.16b, v10.16b, v11.16b
+# CHECK: bic v31.8b, v30.8b, v29.8b
+0x42 0x1c 0x22 0x0e
+0xff 0x1f 0xbe 0x4e
+0x20 0x1c 0x22 0x6e
+0x49 0x1d 0xeb 0x4e
+0xdf 0x1f 0x7d 0x0e
+
+#------------------------------------------------------------------------------
+# Vector Bsl, Bit, Bif
+#------------------------------------------------------------------------------
+# CHECK: bsl v0.8b, v1.8b, v2.8b
+# CHECK: bit v31.16b, v31.16b, v31.16b
+# CHECK: bif v0.16b, v1.16b, v2.16b
+0x20 0x1c 0x62 0x2e
+0xff 0x1f 0xbf 0x6e
+0x20 0x1c 0xe2 0x6e
+
+
+#------------------------------------------------------------------------------
+# Vector Integer Multiply-accumulate and Multiply-subtract
+#------------------------------------------------------------------------------
+# CHECK: mla v0.8b, v1.8b, v2.8b
+# CHECK: mls v31.4h, v31.4h, v31.4h
+0x20 0x94 0x22 0x0e
+0xff 0x97 0x7f 0x2e
+
+#------------------------------------------------------------------------------
+# Vector Floating-Point Multiply-accumulate and Multiply-subtract
+#------------------------------------------------------------------------------
+# CHECK: fmla v0.2s, v1.2s, v2.2s
+# CHECK: fmls v31.2s, v31.2s, v31.2s
+0x20 0xcc 0x22 0x0e
+0xff 0xcf 0xbf 0x0e
+
+#------------------------------------------------------------------------------
+# Vector Move Immediate Shifted
+# Vector Move Inverted Immediate Shifted
+# Vector Bitwise Bit Clear (AND NOT) - immediate
+# Vector Bitwise OR - immediate
+#------------------------------------------------------------------------------
+# CHECK: movi v31.4s, #0xff, lsl #24
+# CHECK: mvni v0.2s, #0x0
+# CHECK: bic v15.4h, #0xf, lsl #8
+# CHECK: orr v16.8h, #0x1f
+0xff 0x67 0x07 0x4f
+0x00 0x04 0x00 0x2f
+0xef 0xb5 0x00 0x2f
+0xf0 0x97 0x00 0x4f
+
+#------------------------------------------------------------------------------
+# Vector Move Immediate Masked
+# Vector Move Inverted Immediate Masked
+#------------------------------------------------------------------------------
+# CHECK: movi v8.2s, #0x8, msl #8
+# CHECK: mvni v16.4s, #0x10, msl #16
+0x08 0xc5 0x00 0x0f
+0x10 0xd6 0x00 0x6f
+
+#------------------------------------------------------------------------------
+# Vector Immediate - per byte
+# Vector Move Immediate - bytemask, per doubleword
+# Vector Move Immediate - bytemask, one doubleword
+#------------------------------------------------------------------------------
+# CHECK: movi v16.8b, #0xff
+# CHECK: movi v31.16b, #0x1f
+# CHECK: movi d15, #0xff00ff00ff00ff
+# CHECK: movi v31.2d, #0xff0000ff0000ffff
+0xf0 0xe7 0x07 0x0f
+0xff 0xe7 0x00 0x4f
+0xaf 0xe6 0x02 0x2f
+0x7f 0xe6 0x04 0x6f
+
+#------------------------------------------------------------------------------
+# Vector Floating Point Move Immediate
+#------------------------------------------------------------------------------
+# CHECK: fmov v0.2s, #13.0
+# CHECK: fmov v15.4s, #1.0
+# CHECK: fmov v31.2d, #-1.25
+0x40 0xf5 0x01 0x0f
+0x0f 0xf6 0x03 0x4f
+0x9f 0xf6 0x07 0x6f
+
+#------------------------------------------------------------------------------
+# Vector Move - register
+#------------------------------------------------------------------------------
+# CHECK: mov v1.16b, v15.16b
+# CHECK: mov v25.8b, v4.8b
+0xe1 0x1d 0xaf 0x4e
+0x99 0x1c 0xa4 0x0e
+
+#----------------------------------------------------------------------
+# Vector Absolute Difference and Accumulate (Signed, Unsigned)
+# Vector Absolute Difference (Signed, Unsigned)
+# Vector Absolute Difference (Floating Point)
+#----------------------------------------------------------------------
+
+# CHECK: uaba v0.8b, v1.8b, v2.8b
+# CHECK: saba v31.16b, v30.16b, v29.16b
+# CHECK: uabd v15.4h, v16.4h, v17.4h
+# CHECK: sabd v5.4h, v4.4h, v6.4h
+# CHECK: fabd v1.4s, v31.4s, v16.4s
+0x20 0x7c 0x22 0x2e
+0xdf 0x7f 0x3d 0x4e
+0x0f 0x76 0x71 0x2e
+0x85 0x74 0x66 0x0e
+0xe1 0xd7 0xb0 0x6e
+
+#----------------------------------------------------------------------
+# Scalar Integer Add
+# Scalar Integer Sub
+#----------------------------------------------------------------------
+
+# CHECK: add d17, d31, d29
+# CHECK: sub d15, d5, d16
+0xf1 0x87 0xfd 0x5e
+0xaf 0x84 0xf0 0x7e
+
+#----------------------------------------------------------------------
+# Vector Reciprocal Square Root Step (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: frsqrts v31.2d, v15.2d, v8.2d
+0xff 0xfd 0xe8 0x4e
+
+#----------------------------------------------------------------------
+# Vector Reciprocal Step (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: frecps v5.4s, v7.4s, v16.4s
+0xe5 0xfc 0x30 0x4e
+
+#----------------------------------------------------------------------
+# Vector Absolute Compare Mask Less Than Or Equal (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: facge v0.4s, v31.4s, v16.4s
+0xe0 0xef 0x30 0x6e
+
+#----------------------------------------------------------------------
+# Vector Absolute Compare Mask Less Than (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: facgt v31.2d, v29.2d, v28.2d
+0xbf 0xef 0xfc 0x6e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Equal (Integer)
+#----------------------------------------------------------------------
+# CHECK: cmeq v5.16b, v15.16b, v31.16b
+0xe5 0x8d 0x3f 0x6e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Higher or Same (Unsigned Integer)
+#----------------------------------------------------------------------
+# CHECK: cmhs v1.8b, v16.8b, v30.8b
+0x01 0x3e 0x3e 0x2e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Greater Than or Equal (Integer)
+#----------------------------------------------------------------------
+# CHECK: cmge v20.4h, v11.4h, v23.4h
+0x74 0x3d 0x77 0x0e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Higher (Unsigned Integer)
+#----------------------------------------------------------------------
+# CHECK: cmhi v13.8h, v3.8h, v27.8h
+0x6d 0x34 0x7b 0x6e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Greater Than (Integer)
+#----------------------------------------------------------------------
+# CHECK: cmgt v9.4s, v4.4s, v28.4s
+0x89 0x34 0xbc 0x4e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Bitwise Test (Integer)
+#----------------------------------------------------------------------
+# CHECK: cmtst v21.2s, v19.2s, v18.2s
+0x75 0x8e 0xb2 0x0e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Equal (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fcmeq v0.2s, v15.2s, v16.2s
+0xe0 0xe5 0x30 0x0e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Greater Than Or Equal (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fcmge v31.4s, v7.4s, v29.4s
+0xff 0xe4 0x3d 0x6e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Greater Than (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fcmgt v17.4s, v8.4s, v25.4s
+0x11 0xe5 0xb9 0x6e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Equal to Zero (Integer)
+#----------------------------------------------------------------------
+# CHECK: cmeq v31.16b, v15.16b, #0x0
+0xff 0x99 0x20 0x4e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Greater Than or Equal to Zero (Signed Integer)
+#----------------------------------------------------------------------
+# CHECK: cmge v3.8b, v15.8b, #0x0
+0xe3 0x89 0x20 0x2e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Greater Than Zero (Signed Integer)
+#----------------------------------------------------------------------
+# CHECK: cmgt v22.2s, v9.2s, #0x0
+0x36 0x89 0xa0 0x0e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Less Than or Equal To Zero (Signed Integer)
+#----------------------------------------------------------------------
+# CHECK: cmle v5.2d, v14.2d, #0x0
+0xc5 0x99 0xe0 0x6e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Less Than Zero (Signed Integer)
+#----------------------------------------------------------------------
+# CHECK: cmlt v13.8h, v11.8h, #0x0
+0x6d 0xa9 0x60 0x4e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Equal to Zero (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fcmeq v15.2s, v21.2s, #0.0
+0xaf 0xda 0xa0 0x0e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Greater Than or Equal to Zero (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fcmge v14.2d, v13.2d, #0.0
+0xae 0xc9 0xe0 0x6e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Greater Than Zero (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fcmgt v9.4s, v23.4s, #0.0
+0xe9 0xca 0xa0 0x4e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Less Than or Equal To Zero (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fcmle v11.2d, v6.2d, #0.0
+0xcb 0xd8 0xe0 0x6e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Less Than Zero (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fcmlt v12.4s, v25.4s, #0.0
+0x2c 0xeb 0xa0 0x4e
+
+
+#------------------------------------------------------------------------------
+# Vector Integer Halving Add (Signed)
+# Vector Integer Halving Add (Unsigned)
+# Vector Integer Halving Sub (Signed)
+# Vector Integer Halving Sub (Unsigned)
+#------------------------------------------------------------------------------
+# CHECK: shadd v0.8b, v31.8b, v29.8b
+# CHECK: uhadd v15.16b, v16.16b, v17.16b
+# CHECK: shsub v0.4h, v1.4h, v2.4h
+# CHECK: uhadd v5.8h, v7.8h, v8.8h
+# CHECK: shsub v9.2s, v11.2s, v21.2s
+# CHECK: uhsub v22.4s, v30.4s, v19.4s
+0xe0 0x07 0x3d 0x0e
+0x0f 0x06 0x31 0x6e
+0x20 0x24 0x62 0x0e
+0xe5 0x04 0x68 0x6e
+0x69 0x25 0xb5 0x0e
+0xd6 0x27 0xb3 0x6e
+
+#------------------------------------------------------------------------------
+# Vector Integer Rounding Halving Add (Signed)
+# Vector Integer Rounding Halving Add (Unsigned)
+#------------------------------------------------------------------------------
+# CHECK: srhadd v3.8b, v5.8b, v7.8b
+# CHECK: urhadd v7.16b, v17.16b, v27.16b
+# CHECK: srhadd v10.4h, v11.4h, v13.4h
+# CHECK: urhadd v1.8h, v2.8h, v3.8h
+# CHECK: srhadd v4.2s, v5.2s, v6.2s
+# CHECK: urhadd v7.4s, v7.4s, v7.4s
+0xa3 0x14 0x27 0x0e
+0x27 0x16 0x3b 0x6e
+0x6a 0x15 0x6d 0x0e
+0x41 0x14 0x63 0x6e
+0xa4 0x14 0xa6 0x0e
+0xe7 0x14 0xa7 0x6e
+
+#------------------------------------------------------------------------------
+# Vector Integer Saturating Add (Signed)
+# Vector Integer Saturating Add (Unsigned)
+# Vector Integer Saturating Sub (Signed)
+# Vector Integer Saturating Sub (Unsigned)
+#------------------------------------------------------------------------------
+# CHECK: sqsub v0.8b, v1.8b, v2.8b
+# CHECK: sqadd v0.16b, v1.16b, v2.16b
+# CHECK: uqsub v0.4h, v1.4h, v2.4h
+# CHECK: uqadd v0.8h, v1.8h, v2.8h
+# CHECK: sqadd v0.2s, v1.2s, v2.2s
+# CHECK: sqsub v0.4s, v1.4s, v2.4s
+# CHECK: sqsub v0.2d, v1.2d, v2.2d
+0x20 0x2c 0x22 0x0e
+0x20 0x0c 0x22 0x4e
+0x20 0x2c 0x62 0x2e
+0x20 0x0c 0x62 0x6e
+0x20 0x0c 0xa2 0x0e
+0x20 0x2c 0xa2 0x4e
+0x20 0x2c 0xe2 0x4e
+
+#------------------------------------------------------------------------------
+# Scalar Integer Saturating Add (Signed)
+# Scalar Integer Saturating Add (Unsigned)
+# Scalar Integer Saturating Sub (Signed)
+# Scalar Integer Saturating Sub (Unsigned)
+#------------------------------------------------------------------------------
+# CHECK: sqadd b20, b11, b15
+# CHECK: uqadd h0, h1, h5
+# CHECK: sqsub s20, s10, s7
+# CHECK: uqsub d16, d16, d16
+0x74 0x0d 0x2f 0x5e
+0x20 0x0c 0x65 0x7e
+0x54 0x2d 0xa7 0x5e
+0x10 0x2e 0xf0 0x7e
+
+
+#----------------------------------------------------------------------
+# Vector Shift Left (Signed and Unsigned Integer)
+#----------------------------------------------------------------------
+# CHECK: sshl v10.8b, v15.8b, v22.8b
+# CHECK: ushl v10.16b, v5.16b, v2.16b
+# CHECK: sshl v10.4h, v15.4h, v22.4h
+# CHECK: ushl v10.8h, v5.8h, v2.8h
+# CHECK: sshl v10.2s, v15.2s, v22.2s
+# CHECK: ushl v10.4s, v5.4s, v2.4s
+# CHECK: sshl v0.2d, v1.2d, v2.2d
+0xea 0x45 0x36 0x0e
+0xaa 0x44 0x22 0x6e
+0xea 0x45 0x76 0x0e
+0xaa 0x44 0x62 0x6e
+0xea 0x45 0xb6 0x0e
+0xaa 0x44 0xa2 0x6e
+0x20 0x44 0xe2 0x4e
+
+#----------------------------------------------------------------------
+# Vector Saturating Shift Left (Signed and Unsigned Integer)
+#----------------------------------------------------------------------
+# CHECK: sqshl v1.8b, v15.8b, v22.8b
+# CHECK: uqshl v2.16b, v14.16b, v23.16b
+# CHECK: sqshl v3.4h, v13.4h, v24.4h
+# CHECK: uqshl v4.8h, v12.8h, v25.8h
+# CHECK: sqshl v5.2s, v11.2s, v26.2s
+# CHECK: uqshl v6.4s, v10.4s, v27.4s
+# CHECK: uqshl v0.2d, v1.2d, v2.2d
+0xe1 0x4d 0x36 0x0e
+0xc2 0x4d 0x37 0x6e
+0xa3 0x4d 0x78 0x0e
+0x84 0x4d 0x79 0x6e
+0x65 0x4d 0xba 0x0e
+0x46 0x4d 0xbb 0x6e
+0x20 0x4c 0xe2 0x6e
+
+#----------------------------------------------------------------------
+# Vector Rounding Shift Left (Signed and Unsigned Integer)
+#----------------------------------------------------------------------
+# CHECK: srshl v10.8b, v5.8b, v22.8b
+# CHECK: urshl v10.16b, v5.16b, v2.16b
+# CHECK: srshl v1.4h, v5.4h, v31.4h
+# CHECK: urshl v1.8h, v5.8h, v2.8h
+# CHECK: srshl v10.2s, v15.2s, v2.2s
+# CHECK: urshl v1.4s, v5.4s, v2.4s
+# CHECK: urshl v0.2d, v1.2d, v2.2d
+0xaa 0x54 0x36 0x0e
+0xaa 0x54 0x22 0x6e
+0xa1 0x54 0x7f 0x0e
+0xa1 0x54 0x62 0x6e
+0xea 0x55 0xa2 0x0e
+0xa1 0x54 0xa2 0x6e
+0x20 0x54 0xe2 0x6e
+
+#----------------------------------------------------------------------
+# Vector Saturating Rounding Shift Left (Signed and Unsigned Integer)
+#----------------------------------------------------------------------
+# CHECK: sqrshl v1.8b, v15.8b, v22.8b
+# CHECK: uqrshl v2.16b, v14.16b, v23.16b
+# CHECK: sqrshl v3.4h, v13.4h, v24.4h
+# CHECK: uqrshl v4.8h, v12.8h, v25.8h
+# CHECK: sqrshl v5.2s, v11.2s, v26.2s
+# CHECK: uqrshl v6.4s, v10.4s, v27.4s
+# CHECK: uqrshl v0.2d, v1.2d, v2.2d
+0xe1 0x5d 0x36 0x0e
+0xc2 0x5d 0x37 0x6e
+0xa3 0x5d 0x78 0x0e
+0x84 0x5d 0x79 0x6e
+0x65 0x5d 0xba 0x0e
+0x46 0x5d 0xbb 0x6e
+0x20 0x5c 0xe2 0x6e
+
+#----------------------------------------------------------------------
+# Scalar Integer Shift Left (Signed, Unsigned)
+#----------------------------------------------------------------------
+# CHECK: sshl d31, d31, d31
+# CHECK: ushl d0, d0, d0
+0xff 0x47 0xff 0x5e
+0x00 0x44 0xe0 0x7e
+
+#----------------------------------------------------------------------
+# Scalar Integer Saturating Shift Left (Signed, Unsigned)
+#----------------------------------------------------------------------
+# CHECK: sqshl d31, d31, d31
+# CHECK: uqshl s23, s20, s16
+# CHECK: sqshl h3, h4, h15
+# CHECK: uqshl b11, b20, b30
+0xff 0x4f 0xff 0x5e
+0x97 0x4e 0xb0 0x7e
+0x83 0x4c 0x6f 0x5e
+0x8b 0x4e 0x3e 0x7e
+
+#----------------------------------------------------------------------
+# Scalar Integer Rounding Shift Left (Signed, Unsigned)
+#----------------------------------------------------------------------
+# CHECK: srshl d16, d16, d16
+# CHECK: urshl d8, d7, d4
+0x10 0x56 0xf0 0x5e
+0xe8 0x54 0xe4 0x7e
+
+#----------------------------------------------------------------------
+# Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned)
+#----------------------------------------------------------------------
+# CHECK: sqrshl d31, d31, d31
+# CHECK: uqrshl s23, s20, s16
+# CHECK: sqrshl h3, h4, h15
+# CHECK: uqrshl b11, b20, b30
+0xff 0x5f 0xff 0x5e
+0x97 0x5e 0xb0 0x7e
+0x83 0x5c 0x6f 0x5e
+0x8b 0x5e 0x3e 0x7e
+
+#----------------------------------------------------------------------
+# Vector Maximum (Signed and Unsigned Integer)
+#----------------------------------------------------------------------
+# CHECK: smax v1.8b, v15.8b, v22.8b
+# CHECK: umax v2.16b, v14.16b, v23.16b
+# CHECK: smax v3.4h, v13.4h, v24.4h
+# CHECK: umax v4.8h, v12.8h, v25.8h
+# CHECK: smax v5.2s, v11.2s, v26.2s
+# CHECK: umax v6.4s, v10.4s, v27.4s
+0xe1 0x65 0x36 0x0e
+0xc2 0x65 0x37 0x6e
+0xa3 0x65 0x78 0x0e
+0x84 0x65 0x79 0x6e
+0x65 0x65 0xba 0x0e
+0x46 0x65 0xbb 0x6e
+
+#----------------------------------------------------------------------
+# Vector Minimum (Signed and Unsigned Integer)
+#----------------------------------------------------------------------
+# CHECK: umin v1.8b, v15.8b, v22.8b
+# CHECK: smin v2.16b, v14.16b, v23.16b
+# CHECK: umin v3.4h, v13.4h, v24.4h
+# CHECK: smin v4.8h, v12.8h, v25.8h
+# CHECK: umin v5.2s, v11.2s, v26.2s
+# CHECK: smin v6.4s, v10.4s, v27.4s
+0xe1 0x6d 0x36 0x2e
+0xc2 0x6d 0x37 0x4e
+0xa3 0x6d 0x78 0x2e
+0x84 0x6d 0x79 0x4e
+0x65 0x6d 0xba 0x2e
+0x46 0x6d 0xbb 0x4e
+
+#----------------------------------------------------------------------
+# Vector Maximum (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fmax v29.2s, v28.2s, v25.2s
+# CHECK: fmax v9.4s, v8.4s, v5.4s
+# CHECK: fmax v11.2d, v10.2d, v7.2d
+0x9d 0xf7 0x39 0x0e
+0x09 0xf5 0x25 0x4e
+0x4b 0xf5 0x67 0x4e
+
+#----------------------------------------------------------------------
+# Vector Minimum (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fmin v29.2s, v28.2s, v25.2s
+# CHECK: fmin v9.4s, v8.4s, v5.4s
+# CHECK: fmin v11.2d, v10.2d, v7.2d
+0x9d 0xf7 0xb9 0x0e
+0x09 0xf5 0xa5 0x4e
+0x4b 0xf5 0xe7 0x4e
+
+#----------------------------------------------------------------------
+# Vector maxNum (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fmaxnm v9.2s, v8.2s, v5.2s
+# CHECK: fmaxnm v9.4s, v8.4s, v5.4s
+# CHECK: fmaxnm v11.2d, v10.2d, v7.2d
+0x09 0xc5 0x25 0x0e
+0x09 0xc5 0x25 0x4e
+0x4b 0xc5 0x67 0x4e
+
+#----------------------------------------------------------------------
+# Vector minNum (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fminnm v2.2s, v8.2s, v25.2s
+# CHECK: fminnm v9.4s, v8.4s, v5.4s
+# CHECK: fminnm v11.2d, v10.2d, v7.2d
+0x02 0xc5 0xb9 0x0e
+0x09 0xc5 0xa5 0x4e
+0x4b 0xc5 0xe7 0x4e
+
+
+#----------------------------------------------------------------------
+# Vector Maximum Pairwise (Signed and Unsigned Integer)
+#----------------------------------------------------------------------
+# CHECK: smaxp v1.8b, v15.8b, v22.8b
+# CHECK: umaxp v2.16b, v14.16b, v23.16b
+# CHECK: smaxp v3.4h, v13.4h, v24.4h
+# CHECK: umaxp v4.8h, v12.8h, v25.8h
+# CHECK: smaxp v5.2s, v11.2s, v26.2s
+# CHECK: umaxp v6.4s, v10.4s, v27.4s
+0xe1 0xa5 0x36 0x0e
+0xc2 0xa5 0x37 0x6e
+0xa3 0xa5 0x78 0x0e
+0x84 0xa5 0x79 0x6e
+0x65 0xa5 0xba 0x0e
+0x46 0xa5 0xbb 0x6e
+
+#----------------------------------------------------------------------
+# Vector Minimum Pairwise (Signed and Unsigned Integer)
+#----------------------------------------------------------------------
+# CHECK: uminp v1.8b, v15.8b, v22.8b
+# CHECK: sminp v2.16b, v14.16b, v23.16b
+# CHECK: uminp v3.4h, v13.4h, v24.4h
+# CHECK: sminp v4.8h, v12.8h, v25.8h
+# CHECK: uminp v5.2s, v11.2s, v26.2s
+# CHECK: sminp v6.4s, v10.4s, v27.4s
+0xe1 0xad 0x36 0x2e
+0xc2 0xad 0x37 0x4e
+0xa3 0xad 0x78 0x2e
+0x84 0xad 0x79 0x4e
+0x65 0xad 0xba 0x2e
+0x46 0xad 0xbb 0x4e
+
+#----------------------------------------------------------------------
+# Vector Maximum Pairwise (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fmaxp v29.2s, v28.2s, v25.2s
+# CHECK: fmaxp v9.4s, v8.4s, v5.4s
+# CHECK: fmaxp v11.2d, v10.2d, v7.2d
+0x9d 0xf7 0x39 0x2e
+0x09 0xf5 0x25 0x6e
+0x4b 0xf5 0x67 0x6e
+
+#----------------------------------------------------------------------
+# Vector Minimum Pairwise (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fminp v29.2s, v28.2s, v25.2s
+# CHECK: fminp v9.4s, v8.4s, v5.4s
+# CHECK: fminp v11.2d, v10.2d, v7.2d
+0x9d 0xf7 0xb9 0x2e
+0x09 0xf5 0xa5 0x6e
+0x4b 0xf5 0xe7 0x6e
+
+#----------------------------------------------------------------------
+# Vector maxNum Pairwise (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fmaxnmp v9.2s, v8.2s, v5.2s
+# CHECK: fmaxnmp v9.4s, v8.4s, v5.4s
+# CHECK: fmaxnmp v11.2d, v10.2d, v7.2d
+0x09 0xc5 0x25 0x2e
+0x09 0xc5 0x25 0x6e
+0x4b 0xc5 0x67 0x6e
+
+#----------------------------------------------------------------------
+# Vector minNum Pairwise (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fminnmp v2.2s, v8.2s, v25.2s
+# CHECK: fminnmp v9.4s, v8.4s, v5.4s
+# CHECK: fminnmp v11.2d, v10.2d, v7.2d
+0x02 0xc5 0xb9 0x2e
+0x09 0xc5 0xa5 0x6e
+0x4b 0xc5 0xe7 0x6e
+
+#------------------------------------------------------------------------------
+# Vector Add Pairwise (Integer)
+#------------------------------------------------------------------------------
+# CHECK: addp v31.8b, v31.8b, v31.8b
+# CHECK: addp v0.2d, v0.2d, v0.2d
+0xff 0xbf 0x3f 0x0e
+0x00 0xbc 0xe0 0x4e
+
+#------------------------------------------------------------------------------
+# Vector Add Pairwise (Floating Point)
+#------------------------------------------------------------------------------
+# CHECK: faddp v0.4s, v0.4s, v0.4s
+# CHECK: faddp v31.2s, v31.2s, v31.2s
+0x00 0xd4 0x20 0x6e
+0xff 0xd7 0x3f 0x2e
+
+
+#------------------------------------------------------------------------------
+# Vector Saturating Doubling Multiply High
+# Vector Saturating Rounding Doubling Multiply High
+#------------------------------------------------------------------------------
+# CHECK: sqdmulh v31.2s, v31.2s, v31.2s
+# CHECK: sqdmulh v5.4s, v7.4s, v9.4s
+# CHECK: sqrdmulh v31.4h, v3.4h, v13.4h
+# CHECK: sqrdmulh v0.8h, v10.8h, v20.8h
+0xff 0xb7 0xbf 0x0e
+0xe5 0xb4 0xa9 0x4e
+0x7f 0xb4 0x6d 0x2e
+0x40 0xb5 0x74 0x6e
+
+#------------------------------------------------------------------------------
+# Vector Multiply Extended
+#------------------------------------------------------------------------------
+# CHECK: fmulx v1.2s, v22.2s, v2.2s
+# CHECK: fmulx v21.4s, v15.4s, v3.4s
+# CHECK: fmulx v11.2d, v5.2d, v23.2d
+0xc1 0xde 0x22 0x0e
+0xf5 0xdd 0x23 0x4e
+0xab 0xdc 0x77 0x4e
+