[ARM] MVE VMOV.i64

In the original batch of MVE VMOVimm code generation VMOV.i64 was left
out due to the way it was done downstream. It turns out that it's fairly
simple though. This adds the codegen for it, similar to NEON.

Bigendian is technically incorrect in this version, which John is fixing
in a Neon patch.
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
index 9b6d668..690e2c3 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
@@ -172,8 +172,7 @@
 define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) {
 ; CHECK-LABEL: add_v2i16_v2i64_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    adr r0, .LCPI12_0
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vmov.i64 q2, #0xffff
 ; CHECK-NEXT:    vand q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vmov r0, s4
@@ -183,13 +182,6 @@
 ; CHECK-NEXT:    umull r0, r1, r1, r0
 ; CHECK-NEXT:    umlal r0, r1, r3, r2
 ; CHECK-NEXT:    bx lr
-; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  @ %bb.1:
-; CHECK-NEXT:  .LCPI12_0:
-; CHECK-NEXT:    .long 65535 @ 0xffff
-; CHECK-NEXT:    .long 0 @ 0x0
-; CHECK-NEXT:    .long 65535 @ 0xffff
-; CHECK-NEXT:    .long 0 @ 0x0
 entry:
   %xx = zext <2 x i16> %x to <2 x i64>
   %yy = zext <2 x i16> %y to <2 x i64>
@@ -507,11 +499,10 @@
 ; CHECK-NEXT:    vmov.u8 r1, q0[0]
 ; CHECK-NEXT:    vmov.32 q3[0], r0
 ; CHECK-NEXT:    vmov.u8 r0, q1[1]
-; CHECK-NEXT:    vmov.32 q3[2], r0
-; CHECK-NEXT:    adr r0, .LCPI23_0
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
 ; CHECK-NEXT:    vmov.32 q4[0], r1
 ; CHECK-NEXT:    vmov.u8 r1, q0[1]
+; CHECK-NEXT:    vmov.32 q3[2], r0
+; CHECK-NEXT:    vmov.i64 q2, #0xff
 ; CHECK-NEXT:    vmov.32 q4[2], r1
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
@@ -703,13 +694,6 @@
 ; CHECK-NEXT:    umlal r0, r1, r3, r2
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    pop {r7, pc}
-; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  @ %bb.1:
-; CHECK-NEXT:  .LCPI23_0:
-; CHECK-NEXT:    .long 255 @ 0xff
-; CHECK-NEXT:    .long 0 @ 0x0
-; CHECK-NEXT:    .long 255 @ 0xff
-; CHECK-NEXT:    .long 0 @ 0x0
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
   %yy = zext <16 x i8> %y to <16 x i64>
@@ -888,8 +872,7 @@
 define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y) {
 ; CHECK-LABEL: add_v2i8_v2i64_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    adr r0, .LCPI25_0
-; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vmov.i64 q2, #0xff
 ; CHECK-NEXT:    vand q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vmov r0, s6
@@ -901,13 +884,6 @@
 ; CHECK-NEXT:    add r0, r2
 ; CHECK-NEXT:    orrs r1, r3
 ; CHECK-NEXT:    bx lr
-; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  @ %bb.1:
-; CHECK-NEXT:  .LCPI25_0:
-; CHECK-NEXT:    .long 255 @ 0xff
-; CHECK-NEXT:    .long 0 @ 0x0
-; CHECK-NEXT:    .long 255 @ 0xff
-; CHECK-NEXT:    .long 0 @ 0x0
 entry:
   %xx = zext <2 x i8> %x to <2 x i64>
   %yy = zext <2 x i8> %y to <2 x i64>
@@ -1162,8 +1138,7 @@
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    adr r2, .LCPI40_0
-; CHECK-NEXT:    vldrw.u32 q2, [r2]
+; CHECK-NEXT:    vmov.i64 q2, #0xffff
 ; CHECK-NEXT:    vand q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vmov r2, s4
@@ -1175,13 +1150,6 @@
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adc.w r1, r1, lr
 ; CHECK-NEXT:    pop {r7, pc}
-; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  @ %bb.1:
-; CHECK-NEXT:  .LCPI40_0:
-; CHECK-NEXT:    .long 65535 @ 0xffff
-; CHECK-NEXT:    .long 0 @ 0x0
-; CHECK-NEXT:    .long 65535 @ 0xffff
-; CHECK-NEXT:    .long 0 @ 0x0
 entry:
   %xx = zext <2 x i16> %x to <2 x i64>
   %yy = zext <2 x i16> %y to <2 x i64>
@@ -1514,17 +1482,16 @@
 ; CHECK-NEXT:    vmov.u8 r3, q0[0]
 ; CHECK-NEXT:    vmov.32 q3[0], r2
 ; CHECK-NEXT:    vmov.u8 r2, q1[1]
-; CHECK-NEXT:    vmov.32 q3[2], r2
-; CHECK-NEXT:    adr r2, .LCPI51_0
-; CHECK-NEXT:    vldrw.u32 q2, [r2]
 ; CHECK-NEXT:    vmov.32 q4[0], r3
 ; CHECK-NEXT:    vmov.u8 r3, q0[1]
-; CHECK-NEXT:    vmov.u8 r4, q0[2]
+; CHECK-NEXT:    vmov.32 q3[2], r2
+; CHECK-NEXT:    vmov.i64 q2, #0xff
 ; CHECK-NEXT:    vmov.32 q4[2], r3
 ; CHECK-NEXT:    vand q3, q3, q2
 ; CHECK-NEXT:    vand q4, q4, q2
 ; CHECK-NEXT:    vmov r2, s14
 ; CHECK-NEXT:    vmov r3, s18
+; CHECK-NEXT:    vmov.u8 r4, q0[2]
 ; CHECK-NEXT:    umull r12, lr, r3, r2
 ; CHECK-NEXT:    vmov r3, s16
 ; CHECK-NEXT:    vmov r2, s12
@@ -1712,13 +1679,6 @@
 ; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
-; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  @ %bb.1:
-; CHECK-NEXT:  .LCPI51_0:
-; CHECK-NEXT:    .long 255 @ 0xff
-; CHECK-NEXT:    .long 0 @ 0x0
-; CHECK-NEXT:    .long 255 @ 0xff
-; CHECK-NEXT:    .long 0 @ 0x0
 entry:
   %xx = zext <16 x i8> %x to <16 x i64>
   %yy = zext <16 x i8> %y to <16 x i64>
@@ -1905,8 +1865,7 @@
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    adr r2, .LCPI53_0
-; CHECK-NEXT:    vldrw.u32 q2, [r2]
+; CHECK-NEXT:    vmov.i64 q2, #0xff
 ; CHECK-NEXT:    vand q1, q1, q2
 ; CHECK-NEXT:    vand q0, q0, q2
 ; CHECK-NEXT:    vmov r2, s6
@@ -1920,13 +1879,6 @@
 ; CHECK-NEXT:    adds r0, r0, r2
 ; CHECK-NEXT:    adcs r1, r3
 ; CHECK-NEXT:    pop {r7, pc}
-; CHECK-NEXT:    .p2align 4
-; CHECK-NEXT:  @ %bb.1:
-; CHECK-NEXT:  .LCPI53_0:
-; CHECK-NEXT:    .long 255 @ 0xff
-; CHECK-NEXT:    .long 0 @ 0x0
-; CHECK-NEXT:    .long 255 @ 0xff
-; CHECK-NEXT:    .long 0 @ 0x0
 entry:
   %xx = zext <2 x i8> %x to <2 x i64>
   %yy = zext <2 x i8> %y to <2 x i64>