Implement vector intrinsics for ARM32.

MultiplyAddPairs is implemented using VMULL+VPADD.
MultiplyHighSigned/Unsigned is implemented using VMULL+VSHRN.
SubVectorLoad/Store is implemented using VLDR/VLD1/VSTR/VST1.
VectorPackSigned/Unsigned is implemented using two VQMOVN.

Bug b/37496078
Bug b/37496856
Bug b/37496321
Bug b/37496082

Change-Id: I141fd901d53da24ce780f503dc7ad17b94fc6ba8
Reviewed-on: https://chromium-review.googlesource.com/693049
Reviewed-by: Jim Stichnoth <stichnot@chromium.org>
Reviewed-on: https://swiftshader-review.googlesource.com/12709
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
diff --git a/third_party/subzero/src/IceAssemblerARM32.cpp b/third_party/subzero/src/IceAssemblerARM32.cpp
index 2f1fa3c..a8e9021 100644
--- a/third_party/subzero/src/IceAssemblerARM32.cpp
+++ b/third_party/subzero/src/IceAssemblerARM32.cpp
@@ -606,6 +606,25 @@
                              "=pc not allowed when CC=1");
 }
 
+enum SIMDShiftType { ST_Vshl, ST_Vshr };
+
+IValueT encodeSIMDShiftImm6(SIMDShiftType Shift, Type ElmtTy,
+                            const IValueT Imm) {
+  assert(Imm > 0);
+  const SizeT MaxShift = getScalarIntBitWidth(ElmtTy);
+  assert(Imm < 2 * MaxShift);
+  assert(ElmtTy == IceType_i8 || ElmtTy == IceType_i16 ||
+         ElmtTy == IceType_i32);
+  const IValueT VshlImm = Imm - MaxShift;
+  const IValueT VshrImm = 2 * MaxShift - Imm;
+  return ((Shift == ST_Vshl) ? VshlImm : VshrImm) & (2 * MaxShift - 1);
+}
+
+IValueT encodeSIMDShiftImm6(SIMDShiftType Shift, Type ElmtTy,
+                            const ConstantInteger32 *Imm6) {
+  const IValueT Imm = Imm6->getValue();
+  return encodeSIMDShiftImm6(Shift, ElmtTy, Imm);
+}
 } // end of anonymous namespace
 
 namespace Ice {
@@ -2838,6 +2857,31 @@
   emitInst(Encoding);
 }
 
+void AssemblerARM32::vldrq(const Operand *OpQd, const Operand *OpAddress,
+                           CondARM32::Cond Cond, const TargetInfo &TInfo) {
+  // This is a pseudo-instruction which loads 64-bit data into a quadword
+  // vector register. It is implemented by loading into the lower doubleword.

+  // VLDR - ARM section A8.8.333, encoding A1.
+  //   vldr<c> <Dd>, [<Rn>{, #+/-<imm>}]
+  //
+  // cccc1101UD01nnnndddd1011iiiiiiii where cccc=Cond, nnnn=Rn, Ddddd=Dd,
+  // iiiiiiii=abs(Imm >> 2), and U=1 if Imm>=0.
+  constexpr const char *Vldrd = "vldrd";
+  IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vldrd));
+  assert(CondARM32::isDefined(Cond));
+  IValueT Address;
+  EncodedOperand AddressEncoding =
+      encodeAddress(OpAddress, Address, TInfo, RotatedImm8Div4Address);
+  (void)AddressEncoding;
+  assert(AddressEncoding == EncodedAsImmRegOffset);
+  IValueT Encoding = B27 | B26 | B24 | B20 | B11 | B9 | B8 |
+                     (encodeCondition(Cond) << kConditionShift) |
+                     (getYInRegYXXXX(Dd) << 22) |
+                     (getXXXXInRegYXXXX(Dd) << 12) | Address;
+  emitInst(Encoding);
+}
+
 void AssemblerARM32::vldrs(const Operand *OpSd, const Operand *OpAddress,
                            CondARM32::Cond Cond, const TargetInfo &TInfo) {
   // VDLR - ARM section A8.8.333, encoding A2.
@@ -2893,6 +2937,38 @@
   emitInst(Encoding);
 }
 
+void AssemblerARM32::emitVMem1Op(IValueT Opcode, IValueT Dd, IValueT Rn,
+                                 IValueT Rm, size_t ElmtSize, IValueT Align,
+                                 const char *InstName) {
+  assert(Utils::IsAbsoluteUint(2, Align));
+  IValueT EncodedElmtSize;
+  switch (ElmtSize) {
+  default: {
+    std::string Buffer;
+    llvm::raw_string_ostream StrBuf(Buffer);
+    StrBuf << InstName << ": found invalid vector element size " << ElmtSize;
+    llvm::report_fatal_error(StrBuf.str());
+  }
+  case 8:
+    EncodedElmtSize = 0;
+    break;
+  case 16:
+    EncodedElmtSize = 1;
+    break;
+  case 32:
+    EncodedElmtSize = 2;
+    break;
+  case 64:
+    EncodedElmtSize = 3;
+  }
+  const IValueT Encoding =
+      Opcode | (encodeCondition(CondARM32::kNone) << kConditionShift) |
+      (getYInRegYXXXX(Dd) << 22) | (Rn << kRnShift) |
+      (getXXXXInRegYXXXX(Dd) << kRdShift) | (EncodedElmtSize << 10) |
+      (Align << 4) | Rm;
+  emitInst(Encoding);
+}
+
 void AssemblerARM32::vld1qr(size_t ElmtSize, const Operand *OpQd,
                             const Operand *OpAddress, const TargetInfo &TInfo) {
   // VLD1 (multiple single elements) - ARM section A8.8.320, encoding A1:
@@ -2915,6 +2991,36 @@
   emitVMem1Op(Opcode, Dd, Rn, Rm, DRegListSize2, ElmtSize, Align, Vld1qr);
 }
 
+void AssemblerARM32::vld1(size_t ElmtSize, const Operand *OpQd,
+                          const Operand *OpAddress, const TargetInfo &TInfo) {
+  // This is a pseudo-instruction for loading a single element of a quadword
+  // vector. For 64-bit the lower doubleword vector is loaded.

+  if (ElmtSize == 64) {
+    return vldrq(OpQd, OpAddress, Ice::CondARM32::AL, TInfo);
+  }

+  // VLD1 (single elements to one lane) - ARMv7-A/R section A8.6.308, encoding
+  // A1:
+  //   VLD1<c>.<size> <list>, [<Rn>{@<align>}], <Rm>
+  //
+  // 111101001D10nnnnddddss00aaaammmm where Dddd=Qd, nnnn=Rn,
+  // aaaa=0 (use default alignment), size=ElmtSize, and ss is the
+  // encoding of ElmtSize.
+  constexpr const char *Vld1qr = "vld1qr";
+  const IValueT Qd = encodeQRegister(OpQd, "Qd", Vld1qr);
+  const IValueT Dd = mapQRegToDReg(Qd);
+  IValueT Address;
+  if (encodeAddress(OpAddress, Address, TInfo, NoImmOffsetAddress) !=
+      EncodedAsImmRegOffset)
+    llvm::report_fatal_error(std::string(Vld1qr) + ": malform memory address");
+  const IValueT Rn = mask(Address, kRnShift, 4);
+  constexpr IValueT Rm = RegARM32::Reg_pc;
+  constexpr IValueT Opcode = B26 | B23 | B21;
+  constexpr IValueT Align = 0; // use default alignment.
+  emitVMem1Op(Opcode, Dd, Rn, Rm, ElmtSize, Align, Vld1qr);
+}
+
 bool AssemblerARM32::vmovqc(const Operand *OpQd, const ConstantInteger32 *Imm) {
   // VMOV (immediate) - ARM section A8.8.320, encoding A1:
   //   VMOV.<dt> <Qd>, #<Imm>
@@ -3226,6 +3332,92 @@
   emitSIMDqqq(VmulqiOpcode, ElmtTy, OpQd, OpQn, OpQm, Vmulqi);
 }
 
+void AssemblerARM32::vmulh(Type ElmtTy, const Operand *OpQd,
+                           const Operand *OpQn, const Operand *OpQm,
+                           bool Unsigned) {
+  // Pseudo-instruction for multiplying the corresponding elements in the lower
+  // halves of two quadword vectors, and returning the high halves.

+  // VMULL (integer and polynomial) - ARMv7-A/R section A8.6.337, encoding A1:
+  //   VMULL<c>.<dt> <Qd>, <Dn>, <Dm>
+  //
+  // 1111001U1Dssnnnndddd11o0N0M0mmmm
+  assert(isScalarIntegerType(ElmtTy) &&
+         "vmull expects vector with integer element type");
+  assert(ElmtTy != IceType_i64 && "vmull on i64 vector not allowed");
+  constexpr const char *Vmull = "vmull";
+
+  constexpr IValueT ElmtShift = 20;
+  const IValueT ElmtSize = encodeElmtType(ElmtTy);
+  assert(Utils::IsUint(2, ElmtSize));
+
+  const IValueT VmullOpcode =
+      B25 | (Unsigned ? B24 : 0) | B23 | (B20) | B11 | B10;
+
+  const IValueT Qd = encodeQRegister(OpQd, "Qd", Vmull);
+  const IValueT Qn = encodeQRegister(OpQn, "Qn", Vmull);
+  const IValueT Qm = encodeQRegister(OpQm, "Qm", Vmull);
+
+  const IValueT Dd = mapQRegToDReg(Qd);
+  const IValueT Dn = mapQRegToDReg(Qn);
+  const IValueT Dm = mapQRegToDReg(Qm);
+
+  constexpr bool UseQRegs = false;
+  constexpr bool IsFloatTy = false;
+  emitSIMDBase(VmullOpcode | (ElmtSize << ElmtShift), Dd, Dn, Dm, UseQRegs,
+               IsFloatTy);
+
+  // Shift right by 16 and narrow to obtain high halves (i16 elements only).
+  constexpr IValueT VshrnOpcode = B25 | B23 | B11 | B4;
+  const IValueT Imm6 = encodeSIMDShiftImm6(ST_Vshr, IceType_i16, 16);
+  constexpr IValueT ImmShift = 16;
+
+  emitSIMDBase(VshrnOpcode | (Imm6 << ImmShift), Dd, 0, Dd, UseQRegs,
+               IsFloatTy);
+}
+
+void AssemblerARM32::vmlap(Type ElmtTy, const Operand *OpQd,
+                           const Operand *OpQn, const Operand *OpQm) {
+  // Pseudo-instruction for multiplying the corresponding elements in the lower
+  // halves of two quadword vectors, and pairwise-adding the results.

+  // VMULL (integer and polynomial) - ARM section A8.8.350, encoding A1:
+  //   vmull<c>.<dt> <Qd>, <Dn>, <Dm>
+  //
+  // 1111001U1Dssnnnndddd11o0N0M0mmmm
+  assert(isScalarIntegerType(ElmtTy) &&
+         "vmull expects vector with integer element type");
+  assert(ElmtTy != IceType_i64 && "vmull on i64 vector not allowed");
+  constexpr const char *Vmull = "vmull";
+
+  constexpr IValueT ElmtShift = 20;
+  const IValueT ElmtSize = encodeElmtType(ElmtTy);
+  assert(Utils::IsUint(2, ElmtSize));
+
+  bool Unsigned = false;
+  const IValueT VmullOpcode =
+      B25 | (Unsigned ? B24 : 0) | B23 | (B20) | B11 | B10;
+
+  const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vmull));
+  const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vmull));
+  const IValueT Dm = mapQRegToDReg(encodeQRegister(OpQm, "Qm", Vmull));
+
+  constexpr bool UseQRegs = false;
+  constexpr bool IsFloatTy = false;
+  emitSIMDBase(VmullOpcode | (ElmtSize << ElmtShift), Dd, Dn, Dm, UseQRegs,
+               IsFloatTy);
+
+  // VPADD - ARM section A8.8.280, encoding A1:
+  //   vpadd.<dt> <Dd>, <Dn>, <Dm>
+  //
+  // 111100100Dssnnnndddd1011NQM1mmmm where Ddddd=<Dd>, Mmmmm=<Dm>, and
+  // Nnnnn=<Dn> and ss is the encoding of <dt>.
+  assert(ElmtTy != IceType_i64 && "vpadd doesn't allow i64!");
+  const IValueT VpaddOpcode =
+      B25 | B11 | B9 | B8 | B4 | ((encodeElmtType(ElmtTy) + 1) << 20);
+  emitSIMDBase(VpaddOpcode, Dd, Dd, Dd + 1, UseQRegs, IsFloatTy);
+}
+
 void AssemblerARM32::vmulqf(const Operand *OpQd, const Operand *OpQn,
                             const Operand *OpQm) {
   // VMUL (floating-point) - ARM section A8.8.351, encoding A1:
@@ -3314,6 +3506,31 @@
   emitInst(Encoding);
 }
 
+void AssemblerARM32::vstrq(const Operand *OpQd, const Operand *OpAddress,
+                           CondARM32::Cond Cond, const TargetInfo &TInfo) {
+  // This is a pseudo-instruction which stores 64-bit data from a quadword
+  // vector register. It is implemented by storing the lower doubleword.

+  // VSTR - ARM section A8.8.413, encoding A1:
+  //   vstr<c> <Dd>, [<Rn>{, #+/-<Imm>}]
+  //
+  // cccc1101UD00nnnndddd1011iiiiiiii where cccc=Cond, nnnn=Rn, Ddddd=Dd,
+  // iiiiiiii=abs(Imm >> 2), and U=1 if Imm>=0.
+  constexpr const char *Vstrd = "vstrd";
+  IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Dd", Vstrd));
+  assert(CondARM32::isDefined(Cond));
+  IValueT Address;
+  IValueT AddressEncoding =
+      encodeAddress(OpAddress, Address, TInfo, RotatedImm8Div4Address);
+  (void)AddressEncoding;
+  assert(AddressEncoding == EncodedAsImmRegOffset);
+  IValueT Encoding = B27 | B26 | B24 | B11 | B9 | B8 |
+                     (encodeCondition(Cond) << kConditionShift) |
+                     (getYInRegYXXXX(Dd) << 22) |
+                     (getXXXXInRegYXXXX(Dd) << 12) | Address;
+  emitInst(Encoding);
+}
+
 void AssemblerARM32::vstrs(const Operand *OpSd, const Operand *OpAddress,
                            CondARM32::Cond Cond, const TargetInfo &TInfo) {
   // VSTR - ARM section A8.8.413, encoding A2:
@@ -3357,6 +3574,37 @@
   emitVMem1Op(Opcode, Dd, Rn, Rm, DRegListSize2, ElmtSize, Align, Vst1qr);
 }
 
+void AssemblerARM32::vst1(size_t ElmtSize, const Operand *OpQd,
+                          const Operand *OpAddress, const TargetInfo &TInfo) {
+
+  // This is a pseudo-instruction for storing a single element of a quadword
+  // vector. For 64-bit the lower doubleword vector is stored.
+
+  if (ElmtSize == 64) {
+    return vstrq(OpQd, OpAddress, Ice::CondARM32::AL, TInfo);
+  }
+
+  // VST1 (single element from one lane) - ARMv7-A/R section A8.6.392, encoding
+  // A1:
+  //   VST1<c>.<size> <list>, [<Rn>{@<align>}], <Rm>
+  //
+  // 111101001D00nnnnddd0ss00aaaammmm where Dddd=Qd, nnnn=Rn,
+  // aaaa=0 (use default alignment), size=ElmtSize, and ss is the
+  // encoding of ElmtSize.
+  constexpr const char *Vst1qr = "vst1qr";
+  const IValueT Qd = encodeQRegister(OpQd, "Qd", Vst1qr);
+  const IValueT Dd = mapQRegToDReg(Qd);
+  IValueT Address;
+  if (encodeAddress(OpAddress, Address, TInfo, NoImmOffsetAddress) !=
+      EncodedAsImmRegOffset)
+    llvm::report_fatal_error(std::string(Vst1qr) + ": malform memory address");
+  const IValueT Rn = mask(Address, kRnShift, 4);
+  constexpr IValueT Rm = RegARM32::Reg_pc;
+  constexpr IValueT Opcode = B26 | B23;
+  constexpr IValueT Align = 0; // use default alignment.
+  emitVMem1Op(Opcode, Dd, Rn, Rm, ElmtSize, Align, Vst1qr);
+}
+
 void AssemblerARM32::vsubs(const Operand *OpSd, const Operand *OpSn,
                            const Operand *OpSm, CondARM32::Cond Cond) {
   // VSUB (floating-point) - ARM section A8.8.415, encoding A2:
@@ -3451,6 +3699,60 @@
   emitSIMDqqq(VsubqiOpcode, ElmtTy, OpQd, OpQm, OpQn, Vsubqi);
 }
 
+void AssemblerARM32::vqmovn2(Type DestElmtTy, const Operand *OpQd,
+                             const Operand *OpQm, const Operand *OpQn,
+                             bool Unsigned, bool Saturating) {
+  // Pseudo-instruction for packing two quadword vectors into one quadword
+  // vector, narrowing each element using saturation or truncation.

+  // VQMOVN - ARMv7-A/R section A8.6.361, encoding A1:
+  //   V{Q}MOV{U}N<c>.<type><size> <Dd>, <Qm>
+  //
+  // 111100111D11ss10dddd0010opM0mmm0 where Ddddd=OpQd, op = 10, Mmmm=OpQm,
+  // ss is 00 (16-bit), 01 (32-bit), or 10 (64-bit).
+
+  assert(DestElmtTy != IceType_i64 &&
+         "vmovn doesn't allow i64 destination vector elements!");
+
+  constexpr const char *Vqmovn = "vqmovn";
+  constexpr bool UseQRegs = false;
+  constexpr bool IsFloatTy = false;
+  const IValueT Qd = encodeQRegister(OpQd, "Qd", Vqmovn);
+  const IValueT Qm = encodeQRegister(OpQm, "Qm", Vqmovn);
+  const IValueT Qn = encodeQRegister(OpQn, "Qn", Vqmovn);
+  const IValueT Dd = mapQRegToDReg(Qd);
+  const IValueT Dm = mapQRegToDReg(Qm);
+  const IValueT Dn = mapQRegToDReg(Qn);
+
+  IValueT VqmovnOpcode = B25 | B24 | B23 | B21 | B20 | B17 | B9 |
+                         (Saturating ? (Unsigned ? B6 : B7) : 0);
+
+  constexpr IValueT ElmtShift = 18;
+  VqmovnOpcode |= (encodeElmtType(DestElmtTy) << ElmtShift);
+  // A source aliasing Qd must be narrowed before Qd is partially overwritten.
+  if (Qm != Qd) {
+    // Qn may alias Qd: narrow it (to the upper half) first.
+    emitSIMDBase(VqmovnOpcode, Dd + 1, 0, Dn, UseQRegs, IsFloatTy);
+    // Narrow first source operand to lower half of destination.
+    emitSIMDBase(VqmovnOpcode, Dd + 0, 0, Dm, UseQRegs, IsFloatTy);
+  } else if (Qn != Qd) {
+    // Qm aliases Qd: narrow it (to the lower half) first.
+    emitSIMDBase(VqmovnOpcode, Dd + 0, 0, Dm, UseQRegs, IsFloatTy);
+    // Narrow second source operand to upper half of destination.
+    emitSIMDBase(VqmovnOpcode, Dd + 1, 0, Dn, UseQRegs, IsFloatTy);
+
+  } else {
+    // Both sources are Qd: narrow to the lower half, then copy it upward.
+    emitSIMDBase(VqmovnOpcode, Dd, 0, Dm, UseQRegs, IsFloatTy);
+
+    // VMOV Dd+1, Dd (VORR with both sources equal to Dd)
+    // 111100100D10mmmmdddd0001MQM1mmmm
+    const IValueT VmovOpcode = B25 | B21 | B8 | B4;
+
+    emitSIMDBase(VmovOpcode, Dd + 1, Dd, Dd, UseQRegs, IsFloatTy);
+  }
+}
+
 void AssemblerARM32::vsubqf(const Operand *OpQd, const Operand *OpQn,
                             const Operand *OpQm) {
   // VSUB (floating-point) - ARM section A8.8.415, Encoding A1:
@@ -3523,22 +3825,6 @@
   emitSIMDqqq(VshlOpcode, ElmtTy, OpQd, OpQn, OpQm, Vshl);
 }
 
-namespace {
-enum SIMDShiftType { ST_Vshl, ST_Vshr };
-IValueT encodeSIMDShiftImm6(SIMDShiftType Shift, Type ElmtTy,
-                            const ConstantInteger32 *Imm6) {
-  const IValueT Imm = Imm6->getValue();
-  assert(Imm > 0);
-  const SizeT MaxShift = getScalarIntBitWidth(ElmtTy);
-  assert(Imm < MaxShift);
-  assert(ElmtTy == IceType_i8 || ElmtTy == IceType_i16 ||
-         ElmtTy == IceType_i32);
-  const IValueT VshlImm = Imm - MaxShift;
-  const IValueT VshrImm = 2 * MaxShift - Imm;
-  return ((Shift == ST_Vshl) ? VshlImm : VshrImm) & (2 * MaxShift - 1);
-}
-} // end of anonymous namespace
-
 void AssemblerARM32::vshlqc(Type ElmtTy, const Operand *OpQd,
                             const Operand *OpQm,
                             const ConstantInteger32 *Imm6) {
diff --git a/third_party/subzero/src/IceAssemblerARM32.h b/third_party/subzero/src/IceAssemblerARM32.h
index a7e8481..1f80043 100644
--- a/third_party/subzero/src/IceAssemblerARM32.h
+++ b/third_party/subzero/src/IceAssemblerARM32.h
@@ -440,16 +440,34 @@
     vldrs(OpSd, OpAddress, Cond, TInfo);
   }
 
+  void vldrq(const Operand *OpQd, const Operand *OpAddress,
+             CondARM32::Cond Cond, const TargetInfo &TInfo);
+
+  void vldrq(const Operand *OpQd, const Operand *OpAddress,
+             CondARM32::Cond Cond, const TargetLowering *Lowering) {
+    const TargetInfo TInfo(Lowering);
+    vldrq(OpQd, OpAddress, Cond, TInfo);
+  }
+
   // ElmtSize = #bits in vector element.
   void vld1qr(size_t ElmtSize, const Operand *OpQd, const Operand *OpRn,
               const TargetInfo &TInfo);
 
+  void vld1(size_t ElmtSize, const Operand *OpQd, const Operand *OpRn,
+            const TargetInfo &TInfo);
+
   void vld1qr(size_t ElmtSize, const Operand *OpQd, const Operand *OpRn,
               const TargetLowering *Lowering) {
     const TargetInfo TInfo(Lowering);
     vld1qr(ElmtSize, OpQd, OpRn, TInfo);
   }
 
+  void vld1(size_t ElmtSize, const Operand *OpQd, const Operand *OpRn,
+            const TargetLowering *Lowering) {
+    const TargetInfo TInfo(Lowering);
+    vld1(ElmtSize, OpQd, OpRn, TInfo);
+  }
+
   // Qn[i] = Imm for all i in vector. Returns true iff Imm can be defined as an
   // Imm8 using AdvSIMDExpandImm().
   bool vmovqc(const Operand *OpQd, const ConstantInteger32 *Imm);
@@ -520,6 +538,14 @@
   void vmulqi(Type ElmtTy, const Operand *OpQd, const Operand *OpQn,
               const Operand *OpQm);
 
+  // Integer vector multiply high.
+  void vmulh(Type ElmtTy, const Operand *OpQd, const Operand *OpQn,
+             const Operand *OpQm, bool Unsigned);
+
+  // Integer vector multiply add pairwise.
+  void vmlap(Type ElmtTy, const Operand *OpQd, const Operand *OpQn,
+             const Operand *OpQm);
+
   // Float vector multiply.
   void vmulqf(const Operand *OpQd, const Operand *OpQn, const Operand *OpQm);
 
@@ -574,6 +600,15 @@
     vstrs(OpSd, OpAddress, Cond, TInfo);
   }
 
+  void vstrq(const Operand *OpQd, const Operand *OpAddress,
+             CondARM32::Cond Cond, const TargetInfo &TInfo);
+
+  void vstrq(const Operand *OpQd, const Operand *OpAddress,
+             CondARM32::Cond Cond, const TargetLowering *Lowering) {
+    const TargetInfo TInfo(Lowering);
+    vstrq(OpQd, OpAddress, Cond, TInfo);
+  }
+
   // ElmtSize = #bits in vector element.
   void vst1qr(size_t ElmtSize, const Operand *OpQd, const Operand *OpAddress,
               const TargetInfo &TInfo);
@@ -584,6 +619,15 @@
     vst1qr(ElmtSize, OpQd, OpRn, TInfo);
   }
 
+  void vst1(size_t ElmtSize, const Operand *OpQd, const Operand *OpAddress,
+            const TargetInfo &TInfo);
+
+  void vst1(size_t ElmtSize, const Operand *OpQd, const Operand *OpRn,
+            const TargetLowering *Lowering) {
+    const TargetInfo TInfo(Lowering);
+    vst1(ElmtSize, OpQd, OpRn, TInfo);
+  }
+
   void vsubd(const Operand *OpDd, const Operand *OpDn, const Operand *OpDm,
              CondARM32::Cond Cond);
 
@@ -603,6 +647,10 @@
   void vqaddqu(Type ElmtTy, const Operand *OpQd, const Operand *OpQm,
                const Operand *OpQn);
 
+  // Integer vector packing with optional saturation.
+  void vqmovn2(Type ElmtTy, const Operand *OpQd, const Operand *OpQm,
+               const Operand *OpQn, bool Unsigned, bool Saturating);
+
   // Float vector subtract
   void vsubqf(const Operand *OpQd, const Operand *OpQm, const Operand *OpQn);
 
@@ -732,6 +780,11 @@
                    DRegListSize NumDRegs, size_t ElmtSize, IValueT Align,
                    const char *InstName);
 
+  // Pattern 111100000D00nnnnddddss00aaaammmm | Opcode where Ddddd=Dd, nnnn=Rn,
+  // mmmm=Rm, ElmtSize in {8, 16, 32} and defines ss, and aa=Align.
+  void emitVMem1Op(IValueT Opcode, IValueT Dd, IValueT Rn, IValueT Rm,
+                   size_t ElmtSize, IValueT Align, const char *InstName);
+
   // Pattern cccc011100x1dddd1111mmmm0001nnn where cccc=Cond,
   // x=Opcode, dddd=Rd, nnnn=Rn, mmmm=Rm.
   void emitDivOp(CondARM32::Cond Cond, IValueT Opcode, IValueT Rd, IValueT Rn,
diff --git a/third_party/subzero/src/IceInstARM32.cpp b/third_party/subzero/src/IceInstARM32.cpp
index 043f4a6..2f12b85 100644
--- a/third_party/subzero/src/IceInstARM32.cpp
+++ b/third_party/subzero/src/IceInstARM32.cpp
@@ -1087,6 +1087,87 @@
   assert(!Asm->needsTextFixup());
 }
 
+template <> void InstARM32Vqmovn2::emitIAS(const Cfg *Func) const {
+  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+  const Operand *Src0 = getSrc(0);
+  const Operand *Src1 = getSrc(1);
+  Type SrcTy = Src0->getType();
+  Type DestTy = Dest->getType();
+  bool Unsigned = true;
+  bool Saturating = true;
+  switch (SrcTy) {
+  default:
+    llvm::report_fatal_error("Vqmovn2 not defined on type " +
+                             typeStdString(SrcTy));
+  case IceType_v8i16:
+  case IceType_v4i32:
+    switch (Sign) {
+    case InstARM32::FS_None:
+      Unsigned = true;
+      Saturating = false;
+      Asm->vqmovn2(typeElementType(DestTy), Dest, Src0, Src1, Unsigned,
+                   Saturating);
+      break;
+    case InstARM32::FS_Unsigned:
+      Unsigned = true;
+      Saturating = true;
+      Asm->vqmovn2(typeElementType(DestTy), Dest, Src0, Src1, Unsigned,
+                   Saturating);
+      break;
+    case InstARM32::FS_Signed:
+      Unsigned = false;
+      Saturating = true;
+      Asm->vqmovn2(typeElementType(DestTy), Dest, Src0, Src1, Unsigned,
+                   Saturating);
+      break;
+    }
+    break;
+  }
+  assert(!Asm->needsTextFixup());
+}
+
+template <> void InstARM32Vmulh::emitIAS(const Cfg *Func) const {
+  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+  const Operand *Src0 = getSrc(0);
+  Type SrcTy = Src0->getType();
+  bool Unsigned = true;
+  switch (SrcTy) {
+  default:
+    llvm::report_fatal_error("Vmulh not defined on type " +
+                             typeStdString(SrcTy));
+  case IceType_v8i16:
+    switch (Sign) {
+    case InstARM32::FS_None: // defaults to unsigned.
+    case InstARM32::FS_Unsigned:
+      Unsigned = true;
+      Asm->vmulh(typeElementType(SrcTy), Dest, getSrc(0), getSrc(1), Unsigned);
+      break;
+    case InstARM32::FS_Signed:
+      Unsigned = false;
+      Asm->vmulh(typeElementType(SrcTy), Dest, getSrc(0), getSrc(1), Unsigned);
+      break;
+    }
+    break;
+  }
+  assert(!Asm->needsTextFixup());
+}
+
+template <> void InstARM32Vmlap::emitIAS(const Cfg *Func) const {
+  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+  const Operand *Src0 = getSrc(0);
+  const Operand *Src1 = getSrc(1);
+  Type SrcTy = Src0->getType();
+  switch (SrcTy) {
+  default:
+    llvm::report_fatal_error("Vmlap not defined on type " +
+                             typeStdString(SrcTy));
+  case IceType_v8i16:
+    Asm->vmlap(typeElementType(SrcTy), Dest, Src0, Src1);
+    break;
+  }
+  assert(!Asm->needsTextFixup());
+}
+
 template <> void InstARM32Vmul::emitIAS(const Cfg *Func) const {
   auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
   const Variable *Dest = getDest();
@@ -1336,6 +1417,14 @@
   addSource(Mem);
 }
 
+InstARM32Vstr1::InstARM32Vstr1(Cfg *Func, Variable *Value, OperandARM32Mem *Mem,
+                               CondARM32::Cond Predicate, SizeT Size)
+    : InstARM32Pred(Func, InstARM32::Vstr1, 2, nullptr, Predicate) {
+  addSource(Value);
+  addSource(Mem);
+  this->Size = Size;
+}
+
 InstARM32Trap::InstARM32Trap(Cfg *Func)
     : InstARM32(Func, InstARM32::Trap, 0, nullptr) {}
 
@@ -1654,6 +1743,8 @@
 // Mov-like ops
 template <> const char *InstARM32Ldr::Opcode = "ldr";
 template <> const char *InstARM32Ldrex::Opcode = "ldrex";
+template <> const char *InstARM32Vldr1d::Opcode = "vldr1d";
+template <> const char *InstARM32Vldr1q::Opcode = "vldr1q";
 // Three-addr ops
 template <> const char *InstARM32Adc::Opcode = "adc";
 template <> const char *InstARM32Add::Opcode = "add";
@@ -1693,6 +1784,12 @@
 const char *InstARM32ThreeAddrFP<InstARM32::Vqadd>::Opcode = "vqadd";
 template <>
 const char *InstARM32ThreeAddrFP<InstARM32::Vqsub>::Opcode = "vqsub";
+template <>
+const char *InstARM32ThreeAddrFP<InstARM32::Vqmovn2>::Opcode = "vqmovn2";
+template <>
+const char *InstARM32ThreeAddrFP<InstARM32::Vmulh>::Opcode = "vmulh";
+template <>
+const char *InstARM32ThreeAddrFP<InstARM32::Vmlap>::Opcode = "vmlap";
 // Four-addr ops
 template <> const char *InstARM32Mla::Opcode = "mla";
 template <> const char *InstARM32Mls::Opcode = "mls";
@@ -2154,6 +2251,62 @@
   getSrc(0)->emit(Func);
 }
 
+template <> void InstARM32Vldr1d::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 1);
+  assert(getDest()->hasReg());
+  Variable *Dest = getDest();
+  Type Ty = Dest->getType();
+  const bool IsVector = isVectorType(Ty);
+  const bool IsScalarFloat = isScalarFloatingType(Ty);
+  const char *ActualOpcode =
+      IsVector ? "vld1" : (IsScalarFloat ? "vldr" : "ldr");
+  const char *WidthString = IsVector ? "" : getWidthString(Ty);
+  Str << "\t" << ActualOpcode;
+  const bool IsVInst = IsVector || IsScalarFloat;
+  if (IsVInst) {
+    Str << getPredicate() << WidthString;
+  } else {
+    Str << WidthString << getPredicate();
+  }
+  if (IsVector)
+    Str << "." << getVecElmtBitsize(Ty);
+  Str << "\t";
+  getDest()->emit(Func);
+  Str << ", ";
+  getSrc(0)->emit(Func);
+}
+
+template <> void InstARM32Vldr1q::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 1);
+  assert(getDest()->hasReg());
+  Variable *Dest = getDest();
+  Type Ty = Dest->getType();
+  const bool IsVector = isVectorType(Ty);
+  const bool IsScalarFloat = isScalarFloatingType(Ty);
+  const char *ActualOpcode =
+      IsVector ? "vld1" : (IsScalarFloat ? "vldr" : "ldr");
+  const char *WidthString = IsVector ? "" : getWidthString(Ty);
+  Str << "\t" << ActualOpcode;
+  const bool IsVInst = IsVector || IsScalarFloat;
+  if (IsVInst) {
+    Str << getPredicate() << WidthString;
+  } else {
+    Str << WidthString << getPredicate();
+  }
+  if (IsVector)
+    Str << "." << getVecElmtBitsize(Ty);
+  Str << "\t";
+  getDest()->emit(Func);
+  Str << ", ";
+  getSrc(0)->emit(Func);
+}
+
 template <> void InstARM32Ldr::emitIAS(const Cfg *Func) const {
   assert(getSrcSize() == 1);
   auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
@@ -2187,6 +2340,20 @@
   }
 }
 
+template <> void InstARM32Vldr1d::emitIAS(const Cfg *Func) const {
+  assert(getSrcSize() == 1);
+  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+  Variable *Dest = getDest();
+  Asm->vld1(32, Dest, getSrc(0), Func->getTarget());
+}
+
+template <> void InstARM32Vldr1q::emitIAS(const Cfg *Func) const {
+  assert(getSrcSize() == 1);
+  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+  Variable *Dest = getDest();
+  Asm->vld1(64, Dest, getSrc(0), Func->getTarget());
+}
+
 template <> void InstARM32Ldrex::emit(const Cfg *Func) const {
   if (!BuildDefs::dump())
     return;
@@ -2593,6 +2760,51 @@
   getSrc(0)->dump(Func);
 }
 
+void InstARM32Vstr1::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 2);
+  Type Ty = getSrc(0)->getType();
+  const bool IsVectorStore = isVectorType(Ty);
+  const bool IsScalarFloat = isScalarFloatingType(Ty);
+  const char *Opcode =
+      IsVectorStore ? "vst1" : (IsScalarFloat ? "vstr" : "str");
+  Str << "\t" << Opcode;
+  const bool IsVInst = IsVectorStore || IsScalarFloat;
+  if (IsVInst) {
+    Str << getPredicate() << getWidthString(Ty);
+  } else {
+    Str << getWidthString(Ty) << getPredicate();
+  }
+  if (IsVectorStore)
+    Str << "." << getVecElmtBitsize(Ty);
+  Str << "\t";
+  getSrc(0)->emit(Func);
+  Str << ", ";
+  getSrc(1)->emit(Func);
+}
+
+void InstARM32Vstr1::emitIAS(const Cfg *Func) const {
+  assert(getSrcSize() == 2);
+  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+  const Operand *Src0 = getSrc(0);
+  const Operand *Src1 = getSrc(1);
+  Asm->vst1(Size, Src0, Src1, Func->getTarget());
+}
+
+void InstARM32Vstr1::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Type Ty = getSrc(0)->getType();
+  dumpOpcodePred(Str, "str", Ty);
+  Str << " ";
+  getSrc(1)->dump(Func);
+  Str << ", ";
+  getSrc(0)->dump(Func);
+}
+
 void InstARM32Trap::emit(const Cfg *Func) const {
   if (!BuildDefs::dump())
     return;
@@ -3166,10 +3378,14 @@
 template class InstARM32ThreeAddrFP<InstARM32::Vsub>;
 template class InstARM32ThreeAddrSignAwareFP<InstARM32::Vqadd>;
 template class InstARM32ThreeAddrSignAwareFP<InstARM32::Vqsub>;
+template class InstARM32ThreeAddrSignAwareFP<InstARM32::Vqmovn2>;
+template class InstARM32ThreeAddrSignAwareFP<InstARM32::Vmulh>;
+template class InstARM32ThreeAddrFP<InstARM32::Vmlap>;
 
 template class InstARM32LoadBase<InstARM32::Ldr>;
 template class InstARM32LoadBase<InstARM32::Ldrex>;
-
+template class InstARM32LoadBase<InstARM32::Vldr1d>;
+template class InstARM32LoadBase<InstARM32::Vldr1q>;
 template class InstARM32TwoAddrGPR<InstARM32::Movt>;
 
 template class InstARM32UnaryopGPR<InstARM32::Movw, false>;
diff --git a/third_party/subzero/src/IceInstARM32.h b/third_party/subzero/src/IceInstARM32.h
index e1344dc..593d96d 100644
--- a/third_party/subzero/src/IceInstARM32.h
+++ b/third_party/subzero/src/IceInstARM32.h
@@ -435,18 +435,24 @@
     Vcvt,
     Vdiv,
     Veor,
+    Vldr1d,
+    Vldr1q,
     Vmla,
+    Vmlap,
     Vmls,
     Vmrs,
     Vmul,
+    Vmulh,
     Vmvn,
     Vneg,
     Vorr,
     Vqadd,
+    Vqmovn2,
     Vqsub,
     Vshl,
     Vshr,
     Vsqrt,
+    Vstr1,
     Vsub
   };
 
@@ -1020,11 +1026,16 @@
 using InstARM32Vorr = InstARM32ThreeAddrFP<InstARM32::Vorr>;
 using InstARM32Vqadd = InstARM32ThreeAddrSignAwareFP<InstARM32::Vqadd>;
 using InstARM32Vqsub = InstARM32ThreeAddrSignAwareFP<InstARM32::Vqsub>;
+using InstARM32Vqmovn2 = InstARM32ThreeAddrSignAwareFP<InstARM32::Vqmovn2>;
+using InstARM32Vmulh = InstARM32ThreeAddrSignAwareFP<InstARM32::Vmulh>;
+using InstARM32Vmlap = InstARM32ThreeAddrFP<InstARM32::Vmlap>;
 using InstARM32Vshl = InstARM32ThreeAddrSignAwareFP<InstARM32::Vshl>;
 using InstARM32Vshr = InstARM32ThreeAddrSignAwareFP<InstARM32::Vshr>;
 using InstARM32Vsub = InstARM32ThreeAddrFP<InstARM32::Vsub>;
 using InstARM32Ldr = InstARM32LoadBase<InstARM32::Ldr>;
 using InstARM32Ldrex = InstARM32LoadBase<InstARM32::Ldrex>;
+using InstARM32Vldr1d = InstARM32LoadBase<InstARM32::Vldr1d>;
+using InstARM32Vldr1q = InstARM32LoadBase<InstARM32::Vldr1q>;
 /// MovT leaves the bottom bits alone so dest is also a source. This helps
 /// indicate that a previous MovW setting dest is not dead code.
 using InstARM32Movt = InstARM32TwoAddrGPR<InstARM32::Movt>;
@@ -1336,6 +1347,33 @@
                  OperandARM32Mem *Mem, CondARM32::Cond Predicate);
 };
 
+/// Sub-vector store instruction. It is important for liveness that there is
+/// no Dest operand: the target is an OperandARM32Mem, not a Dest Variable.
+class InstARM32Vstr1 final : public InstARM32Pred {
+  InstARM32Vstr1() = delete;
+  InstARM32Vstr1(const InstARM32Vstr1 &) = delete;
+  InstARM32Vstr1 &operator=(const InstARM32Vstr1 &) = delete;
+
+public:
+  /// Value must be a register. _vstr1d/_vstr1q pass Size = 32/64.
+  static InstARM32Vstr1 *create(Cfg *Func, Variable *Value,
+                                OperandARM32Mem *Mem, CondARM32::Cond Predicate,
+                                SizeT Size) {
+    return new (Func->allocate<InstARM32Vstr1>())
+        InstARM32Vstr1(Func, Value, Mem, Predicate, Size);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Instr) { return isClassof(Instr, Vstr1); }
+
+private:
+  InstARM32Vstr1(Cfg *Func, Variable *Value, OperandARM32Mem *Mem,
+                 CondARM32::Cond Predicate, SizeT Size);
+
+  SizeT Size; // presumably the store width in bits (32/64); confirm in emit
+};
+
 class InstARM32Trap : public InstARM32 {
   InstARM32Trap() = delete;
   InstARM32Trap(const InstARM32Trap &) = delete;
@@ -1630,6 +1668,8 @@
 template <> void InstARM32Ldr::emit(const Cfg *Func) const;
 template <> void InstARM32Movw::emit(const Cfg *Func) const;
 template <> void InstARM32Movt::emit(const Cfg *Func) const;
+template <> void InstARM32Vldr1d::emit(const Cfg *Func) const;
+template <> void InstARM32Vldr1q::emit(const Cfg *Func) const;
 
 } // end of namespace ARM32
 } // end of namespace Ice
diff --git a/third_party/subzero/src/IceTargetLoweringARM32.cpp b/third_party/subzero/src/IceTargetLoweringARM32.cpp
index 65dca3a..9856f7a 100644
--- a/third_party/subzero/src/IceTargetLoweringARM32.cpp
+++ b/third_party/subzero/src/IceTargetLoweringARM32.cpp
@@ -5331,23 +5331,75 @@
     return;
   }
   case Intrinsics::LoadSubVector: {
-    UnimplementedLoweringError(this, Instr);
+    assert(llvm::isa<ConstantInteger32>(Instr->getArg(1)) &&
+           "LoadSubVector second argument must be a constant");
+    Variable *Dest = Instr->getDest();
+    Type Ty = Dest->getType();
+    auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(1));
+    Operand *Addr = Instr->getArg(0);
+    OperandARM32Mem *Src = formMemoryOperand(Addr, Ty);
+    doMockBoundsCheck(Src);
+    // A rematerializable Dest only gets a fake definition; no load is emitted.
+    if (Dest->isRematerializable()) {
+      Context.insert<InstFakeDef>(Dest);
+      return;
+    }
+    // Arg 1 picks the load helper: 4 -> _vldr1d, 8 -> _vldr1q, else an error.
+    auto *T = makeReg(Ty);
+    switch (SubVectorSize->getValue()) {
+    case 4:
+      _vldr1d(T, Src);
+      break;
+    case 8:
+      _vldr1q(T, Src);
+      break;
+    default:
+      Func->setError("Unexpected size for LoadSubVector");
+      return;
+    }
+    _mov(Dest, T); // FIXME: necessary?
     return;
   }
   case Intrinsics::StoreSubVector: {
-    UnimplementedLoweringError(this, Instr);
+    assert(llvm::isa<ConstantInteger32>(Instr->getArg(2)) &&
+           "StoreSubVector third argument must be a constant");
+    auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(2));
+    Variable *Value = legalizeToReg(Instr->getArg(0));
+    Operand *Addr = Instr->getArg(1);
+    OperandARM32Mem *NewAddr = formMemoryOperand(Addr, Value->getType());
+    doMockBoundsCheck(NewAddr);
+
+    // Value was already placed in a register by legalizeToReg() above, so it
+    // is not legalized a second time. Arg 2 picks the store: 4 or 8.
+    switch (SubVectorSize->getValue()) {
+    case 4:
+      _vstr1d(Value, NewAddr);
+      break;
+    case 8:
+      _vstr1q(Value, NewAddr);
+      break;
+    default:
+      Func->setError("Unexpected size for StoreSubVector");
+      return;
+    }
     return;
   }
   case Intrinsics::MultiplyAddPairs: {
-    UnimplementedLoweringError(this, Instr);
+    auto *Lhs = legalizeToReg(Instr->getArg(0));
+    auto *Rhs = legalizeToReg(Instr->getArg(1));
+    auto *Result = makeReg(DestTy);
+    _vmlap(Result, Lhs, Rhs); // Emits the Vmlap instruction into Result.
+    _mov(Dest, Result);       // Then copies the temporary into Dest.
     return;
   }
-  case Intrinsics::MultiplyHighSigned: {
-    UnimplementedLoweringError(this, Instr);
-    return;
-  }
+  case Intrinsics::MultiplyHighSigned:
   case Intrinsics::MultiplyHighUnsigned: {
-    UnimplementedLoweringError(this, Instr);
+    // Signed and unsigned variants share this lowering; only the flag differs.
+    auto *Lhs = legalizeToReg(Instr->getArg(0));
+    auto *Rhs = legalizeToReg(Instr->getArg(1));
+    auto *Result = makeReg(DestTy);
+    _vmulh(Result, Lhs, Rhs, ID == Intrinsics::MultiplyHighUnsigned);
+    _mov(Dest, Result);
     return;
   }
   case Intrinsics::Nearbyint: {
@@ -5372,12 +5424,15 @@
     _mov(Dest, T);
     return;
   }
-  case Intrinsics::VectorPackSigned: {
-    UnimplementedLoweringError(this, Instr);
-    return;
-  }
+  case Intrinsics::VectorPackSigned:
   case Intrinsics::VectorPackUnsigned: {
-    UnimplementedLoweringError(this, Instr);
+    const bool Saturating = true; // Both pack variants request saturation.
+    const bool Unsigned = ID == Intrinsics::VectorPackUnsigned;
+    auto *Lhs = legalizeToReg(Instr->getArg(0));
+    auto *Rhs = legalizeToReg(Instr->getArg(1));
+    auto *Packed = makeReg(DestTy);
+    _vqmovn2(Packed, Lhs, Rhs, Unsigned, Saturating);
+    _mov(Dest, Packed);
     return;
   }
   default: // UnknownIntrinsic
diff --git a/third_party/subzero/src/IceTargetLoweringARM32.h b/third_party/subzero/src/IceTargetLoweringARM32.h
index be848ed..a82337a 100644
--- a/third_party/subzero/src/IceTargetLoweringARM32.h
+++ b/third_party/subzero/src/IceTargetLoweringARM32.h
@@ -888,18 +888,33 @@
   void _veor(Variable *Dest, Variable *Src0, Variable *Src1) {
     Context.insert<InstARM32Veor>(Dest, Src0, Src1);
   }
+  void _vldr1d(Variable *Dest, OperandARM32Mem *Addr,
+               CondARM32::Cond Pred = CondARM32::AL) { // LoadSubVector size 4
+    Context.insert<InstARM32Vldr1d>(Dest, Addr, Pred);
+  }
+  void _vldr1q(Variable *Dest, OperandARM32Mem *Addr,
+               CondARM32::Cond Pred = CondARM32::AL) { // LoadSubVector size 8
+    Context.insert<InstARM32Vldr1q>(Dest, Addr, Pred);
+  }
   void _vmrs(CondARM32::Cond Pred = CondARM32::AL) {
     Context.insert<InstARM32Vmrs>(Pred);
   }
   void _vmla(Variable *Dest, Variable *Src0, Variable *Src1) {
     Context.insert<InstARM32Vmla>(Dest, Src0, Src1);
   }
+  void _vmlap(Variable *Dest, Variable *Src0, Variable *Src1) {
+    Context.insert<InstARM32Vmlap>(Dest, Src0, Src1); // MultiplyAddPairs
+  }
   void _vmls(Variable *Dest, Variable *Src0, Variable *Src1) {
     Context.insert<InstARM32Vmls>(Dest, Src0, Src1);
   }
   void _vmul(Variable *Dest, Variable *Src0, Variable *Src1) {
     Context.insert<InstARM32Vmul>(Dest, Src0, Src1);
   }
+  void _vmulh(Variable *Dest, Variable *Src0, Variable *Src1, bool Unsigned) {
+    auto *Mulh = Context.insert<InstARM32Vmulh>(Dest, Src0, Src1);
+    Mulh->setSignType(Unsigned ? InstARM32::FS_Unsigned : InstARM32::FS_Signed);
+  }
   void _vmvn(Variable *Dest, Variable *Src0) {
     Context.insert<InstARM32Vmvn>(Dest, Src0, CondARM32::AL);
   }
@@ -914,6 +929,13 @@
     Context.insert<InstARM32Vqadd>(Dest, Src0, Src1)
         ->setSignType(Unsigned ? InstARM32::FS_Unsigned : InstARM32::FS_Signed);
   }
+  // Saturating picks a signed/unsigned sign type; otherwise FS_None is set.
+  void _vqmovn2(Variable *Dest, Variable *Src0, Variable *Src1, bool Unsigned,
+                bool Saturating) {
+    auto Sign = Unsigned ? InstARM32::FS_Unsigned : InstARM32::FS_Signed;
+    Context.insert<InstARM32Vqmovn2>(Dest, Src0, Src1)
+        ->setSignType(Saturating ? Sign : InstARM32::FS_None);
+  }
   void _vqsub(Variable *Dest, Variable *Src0, Variable *Src1, bool Unsigned) {
     Context.insert<InstARM32Vqsub>(Dest, Src0, Src1)
         ->setSignType(Unsigned ? InstARM32::FS_Unsigned : InstARM32::FS_Signed);
@@ -933,6 +955,14 @@
               CondARM32::Cond Pred = CondARM32::AL) {
     Context.insert<InstARM32Vsqrt>(Dest, Src, Pred);
   }
+  void _vstr1d(Variable *Value, OperandARM32Mem *Addr,
+               CondARM32::Cond Pred = CondARM32::AL) { // StoreSubVector size 4
+    Context.insert<InstARM32Vstr1>(Value, Addr, Pred, 32); // Size = 32
+  }
+  void _vstr1q(Variable *Value, OperandARM32Mem *Addr,
+               CondARM32::Cond Pred = CondARM32::AL) { // StoreSubVector size 8
+    Context.insert<InstARM32Vstr1>(Value, Addr, Pred, 64); // Size = 64
+  }
   void _vsub(Variable *Dest, Variable *Src0, Variable *Src1) {
     Context.insert<InstARM32Vsub>(Dest, Src0, Src1);
   }